In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from textblob import TextBlob 
from collections import Counter
import warnings as wn

wn.filterwarnings("ignore")

In [2]:
df = pd.read_csv("data.csv")

## <font color = "brown"> One Hot Encoding </font>

In [3]:
def BagOfWords(Corpus):
    
    # Returns the entire bag of words
    BoW = np.array([])
    for i in Corpus.index:
        
        text = Corpus.loc[i]
        
        # Identify words in text
        text = text.split()
        
        # Concatenate with previously identified texts
        BoW = np.concatenate((BoW, np.array(text)), axis = None)
        BoW = np.unique(BoW)
    
    return BoW

def TransformOHE(df, BoW):

    dfOHE = df.copy()
    
    # Transform dataframe into BoW
    for word in BoW:
        dfOHE[word] = np.where(df["Comments"].str.contains(word), 1, 0)

    # Move y variable to the left
    sentiment = dfOHE["Sentiment"]
    del dfOHE["Sentiment"], dfOHE["Comments"]
    dfOHE["Sentiment"] = sentiment
    
    return dfOHE

In [4]:
BoW = BagOfWords(df["Comments"])
dfOHE = TransformOHE(df, BoW)
dfOHE.to_csv("Output/OHE.csv", index = False)
dfOHE

Unnamed: 0,although,amazed,bad,beautiful,bob,caution,clear,concept,definitely,difficult,...,taking,taught,terrible,though,time,trying,understand,warned,way,Sentiment
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,Negative
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Positive
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Positive
3,0,0,0,0,0,1,0,0,0,1,...,1,1,0,0,0,0,0,0,0,Negative
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,1,0,0,Negative
5,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,Positive
6,0,0,1,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,Negative
7,1,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,Positive
8,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Negative


## <font color = "brown"> Bag of Words </font>

In [5]:
def BagOfWords(Corpus):
    
    # Returns the entire bag of words
    BoW = np.array([])
    for i in Corpus.index:
        
        text = Corpus.loc[i]
        
        # Identify words in text
        text = text.split()
        
        # Concatenate with previously identified texts
        BoW = np.concatenate((BoW, np.array(text)), axis = None)
        BoW = np.unique(BoW)
    
    return BoW

def TransformBoW(df, BoW):

    dfBoW = df.copy()
    
    # Transform dataframe into BoW
    for word in BoW:
        dfBoW[word] = df["Comments"].str.count(word)

    # Move y variable to the left
    sentiment = dfBoW["Sentiment"]
    del dfBoW["Sentiment"], dfBoW["Comments"]
    dfBoW["Sentiment"] = sentiment
    
    return dfBoW

In [6]:
BoW = BagOfWords(df["Comments"])
dfBoW = TransformBoW(df, BoW)
dfBoW.to_csv("Output/BoW.csv", index = False)
dfBoW

Unnamed: 0,although,amazed,bad,beautiful,bob,caution,clear,concept,definitely,difficult,...,taking,taught,terrible,though,time,trying,understand,warned,way,Sentiment
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,Negative
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Positive
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Positive
3,0,0,0,0,0,1,0,0,0,1,...,1,1,0,0,0,0,0,0,0,Negative
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,1,0,0,Negative
5,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,Positive
6,0,0,2,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,Negative
7,1,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,Positive
8,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Negative


## <font color = "brown"> N-Grams </font>

In [9]:
def BagOfNGrams(Corpus, N):
    
    # Returns the entire bag of words
    BoN = np.array([])
    for i in Corpus.index:
        
        text = Corpus.loc[i]
        
        # Identify N-Grams in text
        text = text.split()
        text = [" ".join(text[k:k+N]) for k in range(len(text) - N + 1)]
        
        # Concatenate with previously identified N-Grams
        BoN = np.concatenate((BoN, np.array(text)), axis = None)
        BoN = np.unique(BoN)
    
    return BoN

def TransformBoN(df, BoN):

    dfBoN = df.copy()
    
    # Transform dataframe into BoW
    for grams in BoN:
        dfBoN[grams] = df["Comments"].str.count(grams)

    # Move y variable to the left
    sentiment = dfBoN["Sentiment"]
    del dfBoN["Sentiment"], dfBoN["Comments"]
    dfBoN["Sentiment"] = sentiment
    
    return dfBoN

In [11]:
BoN = BagOfNGrams(df["Comments"], 2)
dfBoN = TransformBoN(df, BoN)
dfBoN.to_csv("Output/BoN.csv", index = False)
dfBoN

Unnamed: 0,although warned,amazed much,bad experience,bad lecturer,beautiful clear,bob bad,bob terrible,caution taking,clear practical,concept still,...,taking subject,taught difficult,terrible lecturer,though bad,time trying,trying understand,understand concept,warned people,way order,Sentiment
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Negative
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Positive
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Positive
3,0,0,0,0,0,0,0,1,0,0,...,1,1,0,0,0,0,0,0,0,Negative
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,1,1,0,0,Negative
5,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,Positive
6,0,0,1,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,Negative
7,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,Positive
8,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,Negative


## <font color = "brown"> TF-IDF </font>

$$
\begin{aligned}
\\
\\
\text{TF}(t_i, d_j) &= \frac{\text{Number of occurence of } t_i \text{ in } d_j}{\text{Number of words in }d_j} \\
\\
\\
\text{IDF}(t_i) &= \log\left(\frac{\text{Total number of documents}}{\text{Number of documents containing } t_i }\right)\\
\\
\\
\text{IDF}(t_i) &= \log\left(\frac{\text{Total number of documents} + 1}{\text{Number of documents containing } t_i  + 1}\right) + 1\\
\\
\\
\text{TF-IDF}_{i, j} &= \text{TF}(t_i, d_j) \times \text{IDF}(t_i) \\
\\
\\
\end{aligned}
$$

In [12]:
def BagOfWords(Corpus):
    
    # Returns the entire bag of words
    BoW = np.array([])
    for i in Corpus.index:
        
        text = Corpus.loc[i]
        
        # Identify words in text
        text = text.split()
        
        # Concatenate with previously identified texts
        BoW = np.concatenate((BoW, np.array(text)), axis = None)
        BoW = np.unique(BoW)
    
    return BoW

def TFIDF(df, BoW, smoothing = False):
    
    dftf = df.copy()
    
    # Total number of documents (rows)
    NumberOfDocuments = len(df)
    
    
    for word in BoW:
        
        # Occurence of word in the document
        FrequencyOfWord = df["Comments"].apply(lambda x: x.split().count(word))
        
        # Total number of words in the document
        WordInDocuments = df["Comments"].apply(lambda x: len(x.split()))
        
        # Number of documents in the entire dataframe that contains the word
        DocumentWithWords = len(df[df["Comments"].apply(lambda x: word in x.split())])
        
        # Calculate TF
        tf = FrequencyOfWord / WordInDocuments
        
        # Calculate IDF
        if smoothing:
            idf = np.log((NumberOfDocuments+1) / (DocumentWithWords+1)) + 1
        else:
            idf = np.log(NumberOfDocuments / DocumentWithWords)
        
        # Calculate TF-IDF
        dftf[word] = tf * idf
        
        # Smoothing
        # idf = np.log((1 + NumberOfDocuments)/(1 + DocumentWithWords)) + 1
        
    # Move y variable to the left
    sentiment = dftf["Sentiment"]
    del dftf["Sentiment"], dftf["Comments"]
    dftf["Sentiment"] = sentiment
    
    return dftf

In [13]:
BoW = BagOfWords(df["Comments"])
dftf = TFIDF(df, BoW)
dftf.to_csv("Output/TFIDF.csv", index = False)
dftf

Unnamed: 0,although,amazed,bad,beautiful,bob,caution,clear,concept,definitely,difficult,...,taking,taught,terrible,though,time,trying,understand,warned,way,Sentiment
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.549306,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Negative
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Positive
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.300815,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Positive
3,0.0,0.0,0.0,0.0,0.0,0.244136,0.0,0.0,0.0,0.122068,...,0.244136,0.244136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Negative
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136734,0.0,0.0,...,0.0,0.0,0.0,0.0,0.199748,0.199748,0.199748,0.0,0.0,Negative
5,0.0,0.0,0.0,0.183102,0.0,0.0,0.183102,0.0,0.0,0.091551,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183102,Positive
6,0.0,0.0,0.439445,0.0,0.150408,0.0,0.0,0.0,0.219722,0.0,...,0.0,0.0,0.0,0.219722,0.0,0.0,0.0,0.0,0.0,Negative
7,0.156945,0.156945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156945,0.0,Positive
8,0.0,0.0,0.0,0.0,0.300815,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.439445,0.0,0.0,0.0,0.0,0.0,0.0,Negative
