In [0]:
# Mount Google Drive at /content/drive (Colab only); the training CSV read
# later in this notebook is loaded from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#Import the necessary libraries
import pandas as pd
import numpy as np
import scipy as sp
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from copy import deepcopy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import pairwise_distances
import pickle

In [0]:
# Fetch the NLTK stopword corpus that stopwords.words("english") reads
# when the tweets are cleaned.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Reading in the data
train = pd.read_csv("/content/drive/My Drive/WebProject/labeledTrainData.csv", delimiter="\t",names=["tweet", "label", "labelValue"])
# Collapse the "spam" and "normal" classes into a single "neutral" class.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: the in-place form acts
# on an intermediate object and does not update `train` when pandas
# Copy-on-Write is active. Assigning also keeps this cell idempotent.
train["label"] = train["label"].replace({"spam": "neutral", "normal": "neutral"})
train.head()



Unnamed: 0,tweet,label,labelValue
0,Beats by Dr. Dre urBeats Wired In-Ear Headphon...,neutral,4
1,RT @Papapishu: Man it would fucking rule if we...,abusive,4
2,It is time to draw close to Him &#128591;&#127...,neutral,4
3,if you notice me start to act different or dis...,neutral,5
4,"Forget unfollowers, I believe in growing. 7 ne...",neutral,3


In [0]:
_ENGLISH_STOPWORDS = None  # lazily-built cache, filled on first use


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, built once and reused."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preprocess_review(review):
    """Clean one piece of raw text for bag-of-words vectorisation.

    Steps, in order:
      1. Strip HTML markup and keep only the text content.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case the text and split it on whitespace.
      4. Drop English stopwords.

    Arg:
        review: raw text (may contain HTML markup/entities).

    Returns:
        clean_review: the remaining words joined by single spaces.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between environments.
    text_without_html = BeautifulSoup(review, "html.parser").get_text()

    letters_only = re.sub("[^a-zA-Z]", " ", text_without_html)
    tokens = letters_only.lower().split()

    # The stopword set is loop-invariant across calls, so it is cached rather
    # than rebuilt from the corpus for every document.
    stop_words = _english_stopwords()

    clean_review = " ".join(word for word in tokens if word not in stop_words)
    return clean_review

    
    


In [0]:

#Clean the reviews and add them to the list below
cleaned_test_reviews = []

# Iterate over the column's values directly. The previous train["tweet"][i]
# lookup is label-based, so it only worked while the index happened to be
# 0..n-1, and it paid a per-row lookup cost on every iteration.
cleaned_reviews = [preprocess_review(tweet) for tweet in train["tweet"]]
num_reviews = len(cleaned_reviews)







In [0]:
def design_matrix(cleaned_reviews, downsample_label=1, downsample_frac=0.75):
    """Build the count, binary and tf-idf design matrices plus a down-sampled variant.

    Args:
        cleaned_reviews: iterable of cleaned documents, expected to hold one
            entry per row of the module-level ``train`` DataFrame, in the same
            order.
        downsample_label: value of ``train["label"]`` whose rows are partially
            removed to build the imbalanced set.
        downsample_frac: fraction of the matching rows to remove.

    Returns:
        X_counts: sparse term-count matrix (vocabulary capped at 5000 terms).
        X_binary: same sparsity pattern as X_counts with every positive count set to 1.
        X_tfidf: tf-idf weighted version of X_counts (smooth_idf=False).
        X_binary_imbalance: rows of X_binary kept after down-sampling.
        imbalance_train: rows of ``train`` kept after down-sampling.

    Side effects:
        Writes the fitted vectorizer to ``vectorizer.pickle`` and the fitted
        tf-idf transformer to ``tfidf.pickle`` in the working directory so
        they can be reused on a test set.
    """
    #1 Generating bag of words with scikit's Count vectorizer
    cv = CountVectorizer(analyzer="word",
                         tokenizer=None,
                         preprocessor=None,
                         stop_words=None,
                         max_features=5000)
    X_counts = cv.fit_transform(cleaned_reviews)

    # Will be used later while testing on the test set. The context manager
    # guarantees the file is flushed and closed.
    with open("vectorizer.pickle", "wb") as fh:
        pickle.dump(cv, fh)

    #2. Presence/absence features: 1 wherever the count is positive.
    # The sparse comparison avoids deep-copying and then mask-assigning.
    X_binary = (X_counts > 0).astype(X_counts.dtype)

    #3. Computing the tfidf values
    tfidf_transformer = TfidfTransformer(smooth_idf=False)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)

    # Will be used later while testing on the test set
    with open("tfidf.pickle", "wb") as fh:
        pickle.dump(tfidf_transformer, fh)

    #4. Generating an imbalanced dataset
    # NOTE(review): the default downsample_label=1 is kept for backward
    # compatibility. If train["label"] holds string labels, `== 1` matches no
    # rows and nothing is removed -- pass the class label to down-sample.
    to_drop = train[train["label"] == downsample_label].sample(
        frac=downsample_frac, random_state=0)
    imbalance_train = train.drop(to_drop.index)

    # Matrix rows are positional, so translate the surviving index labels to
    # positions instead of using the labels themselves as row numbers.
    keep_positions = train.index.get_indexer(imbalance_train.index)
    X_binary_imbalance = X_binary[keep_positions]

    return X_counts,X_binary,X_tfidf,X_binary_imbalance,imbalance_train

In [0]:

# Build every feature representation in one call; design_matrix also writes
# vectorizer.pickle and tfidf.pickle to the working directory.
X_counts,X_binary,X_tfidf,X_binary_imbalance,imbalance_train = design_matrix(cleaned_reviews)


In [0]:
# Show the summary (shape, dtype, stored elements) of the term-count matrix.
X_counts

<99996x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 815121 stored elements in Compressed Sparse Row format>

In [0]:
# Show the summary of the binary (presence/absence) matrix derived from X_counts.
X_binary

<99996x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 815121 stored elements in Compressed Sparse Row format>

In [0]:
# Show the summary of the tf-idf weighted matrix.
X_tfidf

<99996x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 815121 stored elements in Compressed Sparse Row format>

In [0]:
# Show the summary of the down-sampled binary matrix.
# NOTE(review): the saved output reports the same shape and stored-element
# count as X_binary, i.e. no rows were removed -- confirm that the label
# filter inside design_matrix matches the values actually in train["label"].
X_binary_imbalance

<99996x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 815121 stored elements in Compressed Sparse Row format>

In [0]:
# Obtain the label on the original train set and imbalance train set
# Keep exactly one label per row of the design matrices. The labels were
# previously truncated to the first 50000 entries while X_counts / X_tfidf
# kept every row, which makes train_test_split / fit / learning_curve fail
# with an inconsistent-sample-count error.
train_sentiment = train["label"].values
assert len(train_sentiment) == X_counts.shape[0], "labels must align with design-matrix rows"

imbalance_train_sentiment = imbalance_train["label"].values


In [0]:
import matplotlib.pyplot as plt
import random as rd

from sklearn import svm
from sklearn.svm import LinearSVC 
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import cross_val_score

In [0]:
def calculateF1(X, y, k = 5):
    """Randomly sample ten values of C and score each with k-fold cross-validation.

     Each C is drawn as 10**u with u uniform in [-4, 4], i.e. log-uniformly
     over the interval (1e-4, 1e4). The `random` module is seeded with 0
     first, so the same ten values are produced on every call.

     Args: X: Features
           y: Label of sentiment
           k: Number of cross-validation folds

     Returns: c_list: List of all sampled c values.
              f1_list: Mean weighted F1 score of a LinearSVC for each c.
    """
    rd.seed(0) #Setting a common seed

    folds = KFold(n_splits=k)
    c_list, f1_list = [], []

    for _ in range(10):
        exponent = rd.uniform(-4, 4)
        c = 10**exponent
        fold_scores = cross_val_score(LinearSVC(C = c), X, y, cv=folds, scoring='f1_weighted')
        c_list.append(c)
        f1_list.append(fold_scores.mean())

    return c_list, f1_list
        
    

In [0]:
def findBestC(X, y, k = 5):
    """Pick the sampled C with the highest cross-validated F1 score.

     Args: X: Features
           y: Label of sentiment
           k: Number of cross-validation folds
     Returns: c_best: C value with best f1_score.
              improvement: difference between best and worst f1_score.
    """
    # Forward the caller's k; it was previously ignored in favour of a
    # hard-coded 5.
    c_list, f1_list = calculateF1(X, y, k = k)

    improvement = np.amax(f1_list) - np.amin(f1_list)
    c_best = c_list[np.argmax(f1_list)]

    return c_best,improvement

In [0]:
def findImprovement(X,train_sentiment,test_size = 0.2, random_state = 0):
    """Tune C on a training split of X and report the held-out weighted F1 score.

      Args: X: Design Matrix
            train_sentiment: Labels, one per row of X
            test_size: Fraction of the data held out for testing (default 80:20 split)
            random_state: Seed for the stratified split

      Returns:
            c_best: The best C value found by the random search
            improvement: Difference between the best and worst cross-validated
                         F1 score over the sampled C values.
            f1_s: Weighted F1 score on the held-out split using c_best.

      Steps:
      1. Stratified split of the data using test_size / random_state.
      2. Find the best c and the improvement with 5-fold cross validation
         on the training split.
      3. Refit on the training split with that c and score the held-out split.
    """
    # Honour the caller's test_size / random_state; both were previously
    # overridden by hard-coded 0.2 and 0.
    train_ip, test_ip, train_op, test_op = train_test_split(X,
                                                            train_sentiment,
                                                            stratify=train_sentiment,
                                                            test_size=test_size,
                                                            random_state=random_state)

    c_best, improvement = findBestC(train_ip, train_op, k = 5)

    model = LinearSVC(C = c_best)
    model.fit(train_ip, train_op)
    test_pred = model.predict(test_ip)
    f1_s = f1_score(test_op,test_pred,average='weighted')

    return c_best,improvement,f1_s

In [0]:
#Print the improvement using X_counts and the test f1_score using the best c.
#Write your code here.
# findImprovement does the train/test split, the random search over C and the
# held-out evaluation; c_best is reused by the retraining cell that follows.
c_best,improvement,f1_s = findImprovement(X_counts,train_sentiment)

print("The improvement using X_counts is {} and f1_score is {} with best c {}".format(improvement,f1_s,c_best))



The improvement using X_counts is 0.02750623944746411 and f1_score is 0.9063935254854274 with best c 0.173569374988568


In [0]:
# Retrain the classifier using the entire learning set with c_best
#Write your code here.
# NOTE(review): c_best and model are notebook globals that later cells
# reassign, so this cell must run right after the X_counts tuning cell. The
# saved output of this cell shows a different C than the c_best printed by
# that tuning cell, which suggests out-of-order execution -- re-run top to bottom.
model = svm.LinearSVC(C = c_best)
model.fit(X_counts, train_sentiment)

LinearSVC(C=0.010093707860257255, class_weight=None, dual=True,
          fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
          max_iter=1000, multi_class='ovr', penalty='l2', random_state=None,
          tol=0.0001, verbose=0)

#### 3.2.3 Tune an SVM classifier using X_tfidf

In [0]:
#Print the improvement using X_tfidf and the test f1_score using the best c.
#Write your code here.
# Same tuning procedure as for X_counts, applied to the tf-idf features;
# this overwrites the c_best / improvement / f1_s globals from the X_counts run.
c_best,improvement,f1_s = findImprovement(X_tfidf,train_sentiment)
print("The improvement using X_tfidf is {} and f1_score is {} with best c {}".format(improvement,f1_s,c_best))



The improvement using X_tfidf is 0.02989457480753155 and f1_score is 0.9025273788948042 with best c 0.6497939047431794


In [0]:
# Retrain svm using all X_tfidf data
#Write your code here.
# Uses the c_best produced by the X_tfidf tuning cell and replaces the
# `model` global that was fitted on X_counts.
model = svm.LinearSVC(C = c_best)
model.fit(X_tfidf, train_sentiment)

LinearSVC(C=0.6497939047431794, class_weight=None, dual=True,
          fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
          max_iter=1000, multi_class='ovr', penalty='l2', random_state=None,
          tol=0.0001, verbose=0)

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve

In [0]:
training_instances = [100, 500, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 15000, 20000]
#Use the learning_curve module to generate mean train and test scores and plot them with X-axis being the number of training instances and Y-axis.
#Please add appropriate title,labels and legends.
# Scoring is stated explicitly: accuracy is what a classifier's default
# scorer reports, and the error computation below relies on it.
tr, train_scores, test_scores = learning_curve(LogisticRegression(max_iter=500),
                                               X_counts,
                                               train_sentiment,
                                               cv = 5,
                                               scoring = "accuracy",
                                               train_sizes = training_instances)

# Accuracy is a "higher is better" score, so the error rate is 1 - accuracy.
# Negating the score (as is done for neg_* loss scorers) would plot negative
# accuracies under an "error" label.
train_mean = 1 - train_scores.mean(axis = 1) #mean across folds (axis=1)
test_mean = 1 - test_scores.mean(axis = 1)

# learning curve; x-axis uses the training-set sizes actually returned by
# learning_curve so the points always line up with the scores.
fig, ax = plt.subplots(figsize=(8, 8))
ax.grid()
ax.set_title("Learning Curve X_count")
ax.set_xlabel("Training sizes")
ax.set_ylabel("Error (1 - accuracy)")
ax.plot(tr, train_mean, 'o-', color = "r", label = "Training error")
ax.plot(tr, test_mean, 'o-', color = "g", label = "Testing error")
ax.legend(loc = "best")
plt.show()