# Sentiment Analysis on facebook comments

### Importing required libraries

In [1]:
#Data manipulation
import pandas as pd
import numpy as np

#sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


#PyTorch
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

### Mounting google drive and importing data to pandas dataframe

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')

df_train=pd.read_csv("/content/drive/My Drive/facebook_comments.csv", header=None, names=['text','sentiment'], encoding='iso-8859-1',lineterminator='\n')
df_train.head()

Unnamed: 0,text,sentiment
0,Heres a single to add to Kindle. Just read t...,neutral
1,If you tire of Non-Fiction.. Check out http://...,neutral
2,Ghost of Round Island is supposedly nonfiction.,neutral
3,Why is Barnes and Nobles version of the Kindle...,negative
4,@Maria: Do you mean the Nook? Be careful bo...,positive


### Creating new labels column

In [3]:
sent={'positive':2,'neutral':1,'negative':0}
df_train['labels']=df_train['sentiment'].str.strip().map(sent)
df_train.head()

Unnamed: 0,text,sentiment,labels
0,Heres a single to add to Kindle. Just read t...,neutral,1
1,If you tire of Non-Fiction.. Check out http://...,neutral,1
2,Ghost of Round Island is supposedly nonfiction.,neutral,1
3,Why is Barnes and Nobles version of the Kindle...,negative,0
4,@Maria: Do you mean the Nook? Be careful bo...,positive,2


In [4]:
training_texts = df_train.text.values
labels = df_train.labels.values
print(labels.shape)

(1999,)


### Preprocessing data

In [5]:
#Created a vector of 500 input features
vectorizer=TfidfVectorizer(stop_words='english', max_features=500)
instances = vectorizer.fit_transform(training_texts)
X=instances
Y=labels

print(X.shape)
print(Y.shape)

(1999, 500)
(1999,)


### Random Forest Model for Classification

In [6]:

kfold = KFold(n_splits=10, shuffle=True, random_state = 1234)
rf_model = RandomForestClassifier(criterion='entropy', max_depth=2, random_state=1234)
rf_cvscores=[]

for train_idx,test_idx in kfold.split(X):
  rf_model.fit(X[train_idx],Y[train_idx])
  acc = rf_model.score(X[test_idx],Y[test_idx])
  rf_cvscores.append(acc)

print("Random Forest - mean: %.4f%% (std: +/- %.4f%%)"% (np.mean(rf_cvscores)*100,np.std(rf_cvscores)*100))

Random Forest - mean: 64.1304% (std: +/- 2.8070%)


### Fully Connected Feed forward network

In [7]:
#Hyperparameters
epochs = 75
lr = 1e-3
indim = X.shape[1]
outdim = 3
drate = 0.7
batch_size = 16

#Created tensor objects
X_tensor = torch.from_numpy(X.toarray())
Y_tensor = torch.from_numpy(Y)

#Created tensor dataset
dataset = TensorDataset(X_tensor,Y_tensor)

#Splitting data into training and validation
train_size = int(0.8*len(dataset))
val_size =  len(dataset)-train_size
train_dataset,val_dataset = torch.utils.data.random_split(dataset,[train_size,val_size])

#Created DataLoader objects for training and validation
train_loader = DataLoader(train_dataset, batch_size= batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size= batch_size, shuffle=True)

### FFN Model

In [8]:
class SentimetNetwork(nn.Module):
  def __init__(self, input_dim, output_dim, dropout_rate):
    super(SentimetNetwork,self).__init__()
    self.fc1=nn.Linear(input_dim,1024)      #input layer
    self.fc2=nn.Linear(1024,512)            #hidden layer 1
    self.fc3=nn.Linear(512,256)             #hidden layer 2
    self.fc4=nn.Linear(256,128)             #hidden layer 3
    self.fc5=nn.Linear(128,64)              #hidden layer 4
    self.fc6=nn.Linear(64,outdim)           #output layer 

    self.do1 = nn.Dropout(p=dropout_rate, inplace=False)  #Dropout1 
    self.do2 = nn.Dropout(p=dropout_rate, inplace=False)  #Dropout2
    self.do3 = nn.Dropout(p=dropout_rate, inplace=False)  #Dropout3
    self.do4 = nn.Dropout(p=dropout_rate, inplace=False)  #Dropout4
    

  def forward(self,x):
    x = F.relu(self.fc1(x))       #Using relu as activation function
    x = self.do1(x)
    x = F.relu(self.fc2(x))
    x = self.do2(x)
    x = F.relu(self.fc3(x))
    x = self.do3(x)
    x = F.relu(self.fc4(x))
    x = self.do4(x)
    x = F.relu(self.fc5(x))
    x = self.fc6(x)

    return F.softmax(x)           

In [9]:
#FFN model
model = SentimetNetwork(indim, outdim, drate)
print(model)

SentimetNetwork(
  (fc1): Linear(in_features=500, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=256, bias=True)
  (fc4): Linear(in_features=256, out_features=128, bias=True)
  (fc5): Linear(in_features=128, out_features=64, bias=True)
  (fc6): Linear(in_features=64, out_features=3, bias=True)
  (do1): Dropout(p=0.7, inplace=False)
  (do2): Dropout(p=0.7, inplace=False)
  (do3): Dropout(p=0.7, inplace=False)
  (do4): Dropout(p=0.7, inplace=False)
)


In [10]:
#Defined optimizer and loss_function  
optimizer = torch.optim.Adam(model.parameters(), lr=lr) 
criterion = nn.CrossEntropyLoss() 

In [11]:
#Defined a function to calculate accuracy
def accuracy(y, y_pred):
  pred = torch.argmax(y_pred, dim=1)
  pred1 = pred.detach().numpy() #converting tensor object to numpy object
  y1 = y.detach().numpy() #converting tensor object to numpy object
  return accuracy_score(y1,pred1, normalize=True)

#Defined a training process function
def train(model, train_loader, optimizer, criterion):
  
  epoch_loss, epoch_acc = 0.0,0.0 # the loss and accuracy for each epoch
  model.train()
  
  for batch_x, batch_y in train_loader:
    # Forward pass
    y_pred = model(batch_x.float())
    loss = criterion(y_pred, batch_y)
    acc = accuracy(batch_y, y_pred)
    
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    #Calculate Loss and Accuray for each epoch and aggregate it
    epoch_loss += loss.item()
    epoch_acc += acc
    
    #Calcultae mean of epoch_loss and epoch_acc
    epoch_acc_mean = epoch_acc/len(train_loader) #len(train_loader)~0.8*1999/batch_size
    epoch_loss_mean = epoch_loss/len(train_loader) 

  return epoch_loss_mean, epoch_acc_mean

# defined a validation/evaluation process function
def evaluate(model, val_loader, criterion):
  epoch_loss, epoch_acc = 0.0,0.0 # the loss and accuracy for each epoch
  model.eval()
  
  with torch.no_grad(): #we dont optimize in validation function
    for batch_x, batch_y in val_loader:
        y_pred = model(batch_x.float()) 
        loss = criterion(y_pred, batch_y)
        acc = accuracy(batch_y,y_pred)

        #Calculate Loss and Accuray for each epoch and aggregate it
        epoch_loss += loss.item()
        epoch_acc += acc 
        
        #Calcultae mean of epoch_loss and epoch_acc
        epoch_loss_mean = epoch_loss/len(val_loader) #len(val_loader)~0.2*1999/batch_size
        epoch_acc_mean = epoch_acc/len(val_loader)

  return epoch_loss_mean, epoch_acc_mean

In [12]:
#real training and evaluation process

for epoch in range(epochs):
  train_loss, train_acc = train(model, train_loader, optimizer, criterion)
  valid_loss, valid_acc =  evaluate(model, val_loader, criterion)

  print(f'epoch: {epoch+1:02}')
  print(f'\tTrain Loss: {train_loss: .4f} | Train Acc: {train_acc: .4f}') 
  print(f'\tVal Loss: {valid_loss: .4f} | Val Acc: {valid_acc: .4f}')




epoch: 01
	Train Loss:  0.9416 | Train Acc:  0.6423
	Val Loss:  0.9134 | Val Acc:  0.6375
epoch: 02
	Train Loss:  0.8679 | Train Acc:  0.6423
	Val Loss:  0.8071 | Val Acc:  0.6375
epoch: 03
	Train Loss:  0.7577 | Train Acc:  0.8100
	Val Loss:  0.7282 | Val Acc:  0.8200
epoch: 04
	Train Loss:  0.6950 | Train Acc:  0.8580
	Val Loss:  0.7195 | Val Acc:  0.8275
epoch: 05
	Train Loss:  0.6835 | Train Acc:  0.8692
	Val Loss:  0.7168 | Val Acc:  0.8350
epoch: 06
	Train Loss:  0.6706 | Train Acc:  0.8800
	Val Loss:  0.7039 | Val Acc:  0.8475
epoch: 07
	Train Loss:  0.6715 | Train Acc:  0.8794
	Val Loss:  0.7021 | Val Acc:  0.8500
epoch: 08
	Train Loss:  0.6635 | Train Acc:  0.8875
	Val Loss:  0.7140 | Val Acc:  0.8350
epoch: 09
	Train Loss:  0.6650 | Train Acc:  0.8868
	Val Loss:  0.7008 | Val Acc:  0.8500
epoch: 10
	Train Loss:  0.6680 | Train Acc:  0.8831
	Val Loss:  0.6978 | Val Acc:  0.8550
epoch: 11
	Train Loss:  0.6600 | Train Acc:  0.8893
	Val Loss:  0.6984 | Val Acc:  0.8525
epoch: 12
