In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [45]:
# Load the stop-word experiment results; the bare name on the last line makes
# the DataFrame this cell's displayed output.
stopwords_csv = 'stopwords_data.csv'
stopwords_df = pd.read_csv(stopwords_csv)
stopwords_df

Unnamed: 0,max_features,N-gram,Accuracy (Stop Words removed),Accuracy (Stop Words not removed)
0,5000,Unigram,94.29,96.46
1,5000,Unigram and Bigram,94.82,97.63
2,8000,Unigram,94.63,96.59
3,8000,Unigram and Bigram,95.02,98.05
4,12000,Unigram,94.54,96.42
5,12000,Unigram and Bigram,95.29,98.08
6,No limit,Unigram,94.36,96.39
7,No limit,Unigram and Bigram,95.4,97.77


In [59]:
# Line chart of accuracy against max_features: one line per n-gram setting,
# with and without stop-word removal (four lines in total).
fig = go.Figure()

# One subset per n-gram configuration.
unigram = stopwords_df[stopwords_df["N-gram"] == "Unigram"]
unigram_and_bigram = stopwords_df[stopwords_df["N-gram"] == "Unigram and Bigram"]

# One trace per (n-gram setting, accuracy column). The accuracy column and the
# trace name share the same parenthesised suffix, so both are built from it.
for label, subset in (("Unigram", unigram), ("Unigram and Bigram", unigram_and_bigram)):
    for treatment in ("Stop Words removed", "Stop Words not removed"):
        fig.add_trace(go.Scatter(
            x=subset["max_features"],
            y=subset[f"Accuracy ({treatment})"],
            name=f"{label} ({treatment})",
        ))

# max_features mixes numeric strings with "No limit". Left to auto-detection,
# plotly can choose a linear axis when most values parse as numbers, which
# leaves the "No limit" points off the chart. A category axis shows every
# setting, evenly spaced and in data order.
fig.update_layout(title='Accuracy of RBF SVM Model with or without Stop Words',
                  xaxis_title='max_features',
                  xaxis_type='category',
                  yaxis_title='Accuracy (%)')

In [48]:
# Load the word-embedding experiment results; the bare name on the last line
# makes the DataFrame this cell's displayed output.
word_embedding_csv = 'word_embedding_data.csv'
word_embeddings_df = pd.read_csv(word_embedding_csv)
word_embeddings_df

Unnamed: 0,Word Embedding Model,Text Vector Representation Calculation Technique,Accuracy
0,Word2Vec,Summation of vectors,82.05
1,Word2Vec,Mean vector,67.53
2,FastText,Summation of vectors,82.17
3,FastText,Mean vector,73.68
4,GloVe,Summation of vectors,82.85
5,GloVe,Mean vector,75.91


In [58]:

# Grouped bar chart: one bar series per word-embedding model, one group of
# bars per text-vector calculation technique.
technique_col = "Text Vector Representation Calculation Technique"
models = word_embeddings_df["Word Embedding Model"].unique()
techniques = word_embeddings_df[technique_col].unique()

# Read each model's accuracies from the DataFrame instead of retyping them, so
# the chart cannot drift from the loaded CSV. sort=False keeps the models in
# the order they first appear in the data.
fig2 = go.Figure(data=[
    go.Bar(name=embedding, x=rows[technique_col], y=rows["Accuracy"])
    for embedding, rows in word_embeddings_df.groupby("Word Embedding Model", sort=False)
])

# barmode='group' draws the models side by side within each technique.
fig2.update_layout(barmode='group',
                   title='Accuracy of RBF SVM Model with Different Word Embedding Models',
                   xaxis_title='Text Vector Representation Calculation Technique',
                   yaxis_title='Accuracy (%)')

In [51]:
# Load the kernel / n-gram / max_features sweep results; the bare name on the
# last line makes the DataFrame this cell's displayed output.
ngram_csv = 'max_features_up_to_50000.csv'
ngram_df = pd.read_csv(ngram_csv)
ngram_df

Unnamed: 0,max_features,N-gram,Kernel,Accuracy
0,5000,Unigram,Linear,95.92
1,5000,Unigram,RBF,96.66
2,5000,Unigram and Bigram,Linear,97.63
3,5000,Unigram and Bigram,RBF,98.06
4,5000,"Unigram, Bigram and Trigram",Linear,97.74
5,5000,"Unigram, Bigram and Trigram",RBF,97.98
6,10000,Unigram,Linear,95.93
7,10000,Unigram,RBF,96.64
8,10000,Unigram and Bigram,Linear,97.92
9,10000,Unigram and Bigram,RBF,98.07


In [32]:
# Keep the "Unigram and Bigram" rows, then display only their Linear-kernel
# runs as the cell output.
# NOTE(review): this cell reads ngram_df, so on a fresh kernel it must run
# after the cell that loads 'max_features_up_to_50000.csv'.
x = ngram_df[ngram_df["N-gram"].eq("Unigram and Bigram")]
x[x["Kernel"].eq("Linear")]

Unnamed: 0,max_features,N-gram,Kernel,Accuracy
2,5000,Unigram and Bigram,Linear,97.63%
8,10000,Unigram and Bigram,Linear,97.92%
14,15000,Unigram and Bigram,Linear,97.89%
20,20000,Unigram and Bigram,Linear,98.03%
26,25000,Unigram and Bigram,Linear,98.06%
32,30000,Unigram and Bigram,Linear,98.10%
38,35000,Unigram and Bigram,Linear,98.08%
44,40000,Unigram and Bigram,Linear,98.25%
50,45000,Unigram and Bigram,Linear,98.03%
56,50000,Unigram and Bigram,Linear,98.13%


In [57]:
# Accuracy against max_features, one line per (n-gram setting, kernel) pair
# found in ngram_df.
fig3 = go.Figure()

# The set of kernels is the same for every n-gram setting, so compute it once
# rather than on every pass of the outer loop.
kernels = ngram_df["Kernel"].unique()

for ngram in ngram_df["N-gram"].unique():
    ngram_rows = ngram_df.loc[ngram_df["N-gram"] == ngram]
    for kernel in kernels:
        # Filter once and reuse the rows for both the x and y values.
        kernel_rows = ngram_rows.loc[ngram_rows["Kernel"] == kernel]
        fig3.add_trace(go.Scatter(x=kernel_rows["max_features"],
                                  y=kernel_rows["Accuracy"],
                                  name=f'{ngram} {kernel}'))

# The y-axis carries the unit, matching the other accuracy charts in this
# notebook, which plot values on the same percent scale.
fig3.update_layout(title='Accuracy of SVM Model with Different Kernels and N-grams',
                   xaxis_title='max_features',
                   yaxis_title='Accuracy (%)')

fig3.show()

In [53]:
# Load the Linear-vs-RBF results for the larger max_features range; the bare
# name on the last line makes the DataFrame this cell's displayed output.
trigram_csv = 'linear_vs_rbf_trigrams_60000-100000.csv'
trigram_df = pd.read_csv(trigram_csv)
trigram_df

Unnamed: 0,max_features,N-gram,kernel,Accuracy
0,60000,"Unigram, Bigram and Trigram",Linear,98.22
1,60000,"Unigram, Bigram and Trigram",RBF,98.12
2,70000,"Unigram, Bigram and Trigram",Linear,98.22
3,70000,"Unigram, Bigram and Trigram",RBF,98.01
4,80000,"Unigram, Bigram and Trigram",Linear,98.41
5,80000,"Unigram, Bigram and Trigram",RBF,98.27
6,90000,"Unigram, Bigram and Trigram",Linear,98.44
7,90000,"Unigram, Bigram and Trigram",RBF,98.24
8,100000,"Unigram, Bigram and Trigram",Linear,98.22
9,100000,"Unigram, Bigram and Trigram",RBF,98.05


In [56]:
# Accuracy against max_features, one line per (n-gram setting, kernel) pair
# found in trigram_df. Note that this file names its kernel column in lower
# case ("kernel").
fig4 = go.Figure()

# The set of kernels is the same for every n-gram setting, so compute it once
# rather than on every pass of the outer loop.
kernels = trigram_df["kernel"].unique()

for ngram in trigram_df["N-gram"].unique():
    ngram_rows = trigram_df.loc[trigram_df["N-gram"] == ngram]
    for kernel in kernels:
        # Filter once and reuse the rows for both the x and y values.
        kernel_rows = ngram_rows.loc[ngram_rows["kernel"] == kernel]
        fig4.add_trace(go.Scatter(x=kernel_rows["max_features"],
                                  y=kernel_rows["Accuracy"],
                                  name=f'{ngram} {kernel}'))

# The y-axis carries the unit, matching the other accuracy charts in this
# notebook, which plot values on the same percent scale.
fig4.update_layout(title='Accuracy of SVM Model with Different Kernels with Unigram, Bigram and Trigram',
                   xaxis_title='max_features',
                   yaxis_title='Accuracy (%)')

fig4.show()