In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as dt
import pandas as pd 
import numpy as np

# Input and Hyperparameters #

In [26]:
data_path = "./US_length.csv"
learning_rate = 0.00001
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
split = 0.8
batch_size = 100
num_epochs = 1

# Data Cleaning #

In [27]:
df_main = pd.read_csv(data_path, sep = ',')
#subset data for testing ,use the first 10000
df = df_main[:10000]
secs = df['sec'].values
mins = df['min'].values
hours = df['hour'].values
total_time = [s+60*m+3600*h for s,m,h in zip(secs, mins, hours)]
df['total_time'] = total_time
df_features=df[['title', 'total_time']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [28]:
df_features['title_upper_count'] = df_features['title'].str.findall(r'[A-Z]').str.len()
df_features['tag_count'] = [len(s.split('|')) for s in df['tags'].values]
df_features['category_id'] = df_main['category_id']
df_label = pd.DataFrame()
df_label['views'] = df['views']
df_features = df_features.drop(columns = 'title')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [29]:
df_features.head()


Unnamed: 0,total_time,title_upper_count,tag_count,category_id
0,678,28,1,22
1,1431,11,4,24
2,569,8,23,23
3,430,4,27,24
4,733,12,14,24


In [30]:
df_label.head()


Unnamed: 0,views
0,748374
1,2418783
2,3191434
3,343168
4,2095731


# Setup Dataloader #

In [35]:
tv_split = int(len(df_features)*split)
train_set = df_features[:tv_split]
train_label = df_label[:tv_split]
validation_set = df_features[tv_split:]
validation_label = df_label[tv_split:]


In [36]:
class Dataset(dt.Dataset):
    def __init__(self, feature, labels):
        self.labels = labels
        self.feature = feature
    def __len__(self):
        return(len(self.feature))
    def __getitem__(self, idx):
        cur_feature = self.feature.iloc[idx]
        cur_feature = np.array(cur_feature)
        cur_label = self.labels.iloc[idx]
        cur_label = np.array(cur_label)
        sample = (cur_feature, cur_label)
        return sample

In [37]:
train_loader = Dataset(train_set, train_label)
test_loader = Dataset(validation_set, validation_label)

# Define Neural Network

In [38]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size).double() 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes).double()
        self.sigmoid = nn.Sigmoid()
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

# Main #

In [39]:
input_size = len(df_features.iloc[0])
output_size = len(df_label.columns)
hidden_size = int(np.ceil((input_size+output_size)/2))
model = NeuralNet(input_size, hidden_size, output_size).to(device)
# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (feature, labels) in enumerate(train_loader):  
        # Move tensors to the configured device
        #print(feature, labels)
        feature = torch.tensor(feature,  dtype = torch.float64).to(device)
        label = torch.tensor(labels,  dtype = torch.float64).to(device)

        # Forward pass/
        outputs = model(feature)
        #print(outputs, label)
        loss = criterion(outputs, label)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

# Test the model

Epoch [1/1], Step [100/8000], Loss: 38292619224.9929
Epoch [1/1], Step [200/8000], Loss: 314237603761.0000
Epoch [1/1], Step [300/8000], Loss: 56840758568.9998
Epoch [1/1], Step [400/8000], Loss: 43130151684.0000
Epoch [1/1], Step [500/8000], Loss: 102270121208.9994
Epoch [1/1], Step [600/8000], Loss: 13546374691600.0000
Epoch [1/1], Step [700/8000], Loss: 1222761022.8561
Epoch [1/1], Step [800/8000], Loss: 39024980.9099
Epoch [1/1], Step [900/8000], Loss: 482156640615.7137
Epoch [1/1], Step [1000/8000], Loss: 11728144493.9509
Epoch [1/1], Step [1100/8000], Loss: 1176009849.0000
Epoch [1/1], Step [1200/8000], Loss: 12338099929.0000
Epoch [1/1], Step [1300/8000], Loss: 2822371792.1150
Epoch [1/1], Step [1400/8000], Loss: 834058439824.0000
Epoch [1/1], Step [1500/8000], Loss: 7027804123.7329
Epoch [1/1], Step [1600/8000], Loss: 35557265.0928
Epoch [1/1], Step [1700/8000], Loss: 109297021201.0000
Epoch [1/1], Step [1800/8000], Loss: 9507781073961.0000
Epoch [1/1], Step [1900/8000], Loss: 

In [40]:
# for i, (feature, labels) in enumerate(train_loader):  
#     if(i >10):
#         break
#     print((type(feature)))
#     print(labels)

<class 'numpy.ndarray'>
[748374]
<class 'numpy.ndarray'>
[2418783]
<class 'numpy.ndarray'>
[3191434]
<class 'numpy.ndarray'>
[343168]
<class 'numpy.ndarray'>
[2095731]
<class 'numpy.ndarray'>
[119180]
<class 'numpy.ndarray'>
[2103417]
<class 'numpy.ndarray'>
[817732]
<class 'numpy.ndarray'>
[826059]
<class 'numpy.ndarray'>
[256426]
<class 'numpy.ndarray'>
[81377]
