<a href="https://colab.research.google.com/github/gupta24789/sentiment-analysis/blob/main/bow_tfidf_ngram_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## We will explore the following techniques

- BOW
- TFIDF
- N-grams with BOW and TFIDF


In [36]:
!pip install neattext



In [98]:
import itertools
import pickle
from collections import Counter

import neattext as nt
import numpy as np
import pandas as pd
from neattext.functions import clean_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

## Read Data

In [117]:
## Base location of the train/validation CSV files.
DATA_URL = "https://raw.githubusercontent.com/gupta24789/sentiment-analysis/main/data"
## The only columns this notebook uses downstream.
USED_COLUMNS = ['raw_tweet', 'label']

## The `processed_tweet` column (which appears to hold list literals serialized as
## strings) is deliberately left unparsed: it is not used in this notebook, and
## calling `eval` on text downloaded from the network can execute arbitrary code.
## If the token lists are ever needed, parse them with `ast.literal_eval`.
## Rows with a missing tweet or label are dropped and the index is renumbered from 0.
train_df = pd.read_csv(f"{DATA_URL}/train.csv")[USED_COLUMNS].dropna().reset_index(drop=True)
val_df = pd.read_csv(f"{DATA_URL}/val.csv")[USED_COLUMNS].dropna().reset_index(drop=True)

In [118]:
## Label distribution: number of training rows per class
train_df['label'].value_counts()

1.0    4000
0.0    4000
Name: label, dtype: int64

In [119]:
## Peek at the first three training rows
train_df.head(n=3)

Unnamed: 0,raw_tweet,label
0,Want to say a huge thanks to @WarriorAssaultS ...,1.0
1,@jaynehh_ you just need a job and get a letter...,1.0
2,"@knhillrocks HA yes, make it quick tho :D",1.0


## Clean Text

In [120]:
## Per-tweet result of neattext's TextFrame.noise_scan() (a dict per row)
train_df['raw_tweet'].apply(lambda tweet: nt.TextFrame(tweet).noise_scan())

0       {'text_noise': 11.76470588235294, 'text_length...
1       {'text_noise': 14.17910447761194, 'text_length...
2       {'text_noise': 12.195121951219512, 'text_lengt...
3       {'text_noise': 17.56756756756757, 'text_length...
4       {'text_noise': 11.940298507462686, 'text_lengt...
                              ...                        
7995    {'text_noise': 17.582417582417584, 'text_lengt...
7996    {'text_noise': 29.411764705882355, 'text_lengt...
7997    {'text_noise': 13.953488372093023, 'text_lengt...
7998    {'text_noise': 16.842105263157894, 'text_lengt...
7999    {'text_noise': 36.36363636363637, 'text_length...
Name: raw_tweet, Length: 8000, dtype: object

In [121]:
## Per-tweet result of neattext's TextFrame.count_stopwords() (stopword -> count)
train_df['raw_tweet'].apply(lambda tweet: nt.TextFrame(tweet).count_stopwords())

0         {'to': 2, 'say': 1, 'a': 1, 'for': 1, 'the': 1}
1       {'you': 2, 'just': 1, 'a': 3, 'and': 2, 'get':...
2                                    {'make': 1, 'it': 1}
3       {'for': 1, 'me': 1, 'back': 1, 'not': 1, 'you'...
4                   {'out': 1, 'a': 1, 'for': 1, 'my': 1}
                              ...                        
7995                                            {'it': 1}
7996                                  {'why': 1, 'so': 1}
7997                                            {'to': 1}
7998    {'my': 1, 'were': 1, 'and': 1, 'there': 1, 'wa...
7999               {'why': 1, 'am': 1, 'i': 1, 'this': 1}
Name: raw_tweet, Length: 8000, dtype: object

In [123]:
## another way to clean the text: neattext's functional `clean_text`, then
## TextCleaner's remove_userhandles() and remove_multiple_spaces()
clean_text_options = dict(
    puncts=True,
    stopwords=True,
    urls=True,
    emails=True,
    numbers=True,
    emojis=False,
    special_char=False,
    phone_num=True,
    non_ascii=False,
    multiple_whitespaces=True,
    contractions=False,
    currency_symbols=False,
    custom_pattern=None,
)


def _clean_with_neattext(tweet):
    """Apply `clean_text` with `clean_text_options`, then strip user handles and multiple spaces."""
    cleaner = nt.TextCleaner(clean_text(tweet, **clean_text_options))
    return cleaner.remove_userhandles().remove_multiple_spaces()


train_df['raw_tweet'].apply(_clean_with_neattext)

0                  want huge thanks #ff thanks support :)
1        need job letter work place saying work letter...
2                                     ha yes quick tho :d
3              thanks texting :)) im texting tomorrow :))
4       laying greetings card range print today love j...
                              ...                        
7995     im watching madrid leads supersport aint show...
7996                                           sudden :((
7997                       :((( bout instant transmission
7998            feet cold point taking socks okay stop :(
7999                             oppa/hug ur arm/ like :(
Name: raw_tweet, Length: 8000, dtype: object

In [124]:
def custom_clean_text(x):
    """Clean a single piece of text with neattext and return it lower-cased.

    The text is wrapped in a ``nt.TextFrame`` and the following methods are
    applied in this order: ``remove_stopwords``, ``remove_urls``,
    ``remove_emails``, ``remove_dates``, ``remove_puncts``, ``remove_numbers``,
    ``remove_userhandles``, ``remove_multiple_spaces``.

    Parameters
    ----------
    x : str
        Raw text (e.g. one tweet).

    Returns
    -------
    str
        The ``.text`` of the cleaned frame, converted to lower case.
    """
    frame = nt.TextFrame(x)
    cleaned = (
        frame.remove_stopwords()
        .remove_urls()
        .remove_emails()
        .remove_dates()
        .remove_puncts()
        .remove_numbers()
        .remove_userhandles()
        .remove_multiple_spaces()
    )
    return cleaned.text.lower()

In [125]:
## Add the cleaned text as a new column on both splits
## (custom_clean_text is passed directly; no wrapper lambda is needed)
train_df['processed_text'] = train_df['raw_tweet'].apply(custom_clean_text)
val_df['processed_text'] = val_df['raw_tweet'].apply(custom_clean_text)

In [126]:
## First three rows, now including the `processed_text` column
train_df.head(n=3)

Unnamed: 0,raw_tweet,label,processed_text
0,Want to say a huge thanks to @WarriorAssaultS ...,1.0,want huge thanks #ff thanks support :)
1,@jaynehh_ you just need a job and get a letter...,1.0,need job letter work place saying work letter...
2,"@knhillrocks HA yes, make it quick tho :D",1.0,ha yes quick tho :d


In [134]:
## Features are the cleaned text, targets are the sentiment labels
X_train, y_train = train_df['processed_text'], train_df['label']
X_val, y_val = val_df['processed_text'], val_df['label']

In [164]:
## One cleaned tweet (index label 0) used to illustrate each vectorizer
sample_tweet = train_df.loc[0, 'processed_text']
print(sample_tweet)

want huge thanks #ff thanks support :)


## BOW

In [165]:
## Unigram bag-of-words: ngram_range=(1, 1) means single tokens only
vectorizer = CountVectorizer(ngram_range=(1, 1), lowercase=True)
vectorizer

In [168]:
## Fit on the single sample tweet, then show the count row and the token -> column map.
## Note in the vocabulary that `#ff` is indexed as `ff` and `:)` is absent:
## CountVectorizer's default token pattern keeps only runs of 2+ word characters.
processed_tweet = vectorizer.fit_transform([sample_tweet])
print(processed_tweet.toarray(), vectorizer.vocabulary_, sep='\n')

[[1 1 1 2 1]]
{'want': 4, 'huge': 1, 'thanks': 3, 'ff': 0, 'support': 2}


In [169]:
## Let's create the pipeline using BOW
## random_state fixes the forest's random sampling so the metrics and spot-check
## predictions are reproducible across kernel restarts.
classifier = Pipeline([
    ('count_vectorizer', CountVectorizer(lowercase=True, ngram_range=(1, 1))),
    ('bow_model', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42))
])

In [170]:
## Fit the vectorizer and the forest on the training split
classifier.fit(X=X_train, y=y_train)

In [171]:
## Predict labels for the validation split
y_pred = classifier.predict(X=X_val)

In [172]:
## Per-class precision / recall / F1 on the validation split
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.71      0.80      0.75      1000
           1       0.77      0.68      0.72      1000

    accuracy                           0.74      2000
   macro avg       0.74      0.74      0.74      2000
weighted avg       0.74      0.74      0.74      2000



In [173]:
## Spot-check the current classifier on a positive sentence,
## cleaned with the same function as the training text
input_text = 'i love this movie'
clean_input_text = custom_clean_text(x=input_text)
classifier.predict(X=[clean_input_text])

array([0.])

In [174]:
## Spot-check the current classifier on a negative sentence,
## cleaned with the same function as the training text
input_text = 'i hate this movie'
clean_input_text = custom_clean_text(x=input_text)
classifier.predict(X=[clean_input_text])

array([0.])

## BOW using ngram

In [175]:
## Bag-of-words over unigrams and bigrams: ngram_range=(1, 2)
vectorizer = CountVectorizer(ngram_range=(1, 2), lowercase=True)

In [177]:
## Fit on the single sample tweet; the vocabulary now also contains
## adjacent-token pairs such as 'huge thanks'
processed_tweet = vectorizer.fit_transform([sample_tweet])
print(processed_tweet.toarray(), vectorizer.vocabulary_, sep='\n')

[[1 1 1 1 1 2 1 1 1 1]]
{'want': 8, 'huge': 2, 'thanks': 5, 'ff': 0, 'support': 4, 'want huge': 9, 'huge thanks': 3, 'thanks ff': 6, 'ff thanks': 1, 'thanks support': 7}


In [178]:
## Model building: BOW over unigrams + bigrams feeding a random forest.
## random_state fixes the forest's random sampling so the metrics and spot-check
## predictions are reproducible across kernel restarts.
classifier = Pipeline([
    ('count_vectorizer', CountVectorizer(lowercase=True, ngram_range=(1, 2))),
    ('bow_model', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42))
])

classifier.fit(X_train, y_train)

In [179]:
## Predict on the validation split and report per-class metrics
y_pred = classifier.predict(X=X_val)
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.70      0.83      0.76      1000
           1       0.79      0.65      0.71      1000

    accuracy                           0.74      2000
   macro avg       0.75      0.74      0.74      2000
weighted avg       0.75      0.74      0.74      2000



In [180]:
## Spot-check the current classifier on a positive sentence,
## cleaned with the same function as the training text
input_text = 'i love this movie'
clean_input_text = custom_clean_text(x=input_text)
classifier.predict(X=[clean_input_text])

array([0.])

In [181]:
## Spot-check the current classifier on a negative sentence,
## cleaned with the same function as the training text
input_text = 'i hate this movie'
clean_input_text = custom_clean_text(x=input_text)
classifier.predict(X=[clean_input_text])

array([0.])

## TFIDF

In [182]:
## Unigram TF-IDF: same tokens as BOW, but weighted instead of raw counts
vectorizer = TfidfVectorizer(ngram_range=(1, 1), lowercase=True)
vectorizer

In [183]:
## Fit on the single sample tweet. With only one document every term gets the
## same IDF, so the printed row is the L2-normalised count vector
## (counts [1 1 1 2 1] divided by sqrt(8)).
processed_tweet = vectorizer.fit_transform([sample_tweet])
print(processed_tweet.toarray(), vectorizer.vocabulary_, sep='\n')

[[0.35355339 0.35355339 0.35355339 0.70710678 0.35355339]]
{'want': 4, 'huge': 1, 'thanks': 3, 'ff': 0, 'support': 2}


In [184]:
## Model building: unigram TF-IDF feeding a random forest.
## The first step is named for what it holds (a TfidfVectorizer, not a CountVectorizer).
## random_state fixes the forest's random sampling so the metrics and spot-check
## predictions are reproducible across kernel restarts.
classifier = Pipeline([
    ('tfidf_vectorizer', TfidfVectorizer(lowercase=True, ngram_range=(1, 1))),
    ('tfidf_model', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42))
])

classifier.fit(X_train, y_train)

In [185]:
## Predict on the validation split and report per-class metrics
y_pred = classifier.predict(X=X_val)
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.71      0.80      0.75      1000
           1       0.77      0.67      0.72      1000

    accuracy                           0.73      2000
   macro avg       0.74      0.74      0.73      2000
weighted avg       0.74      0.73      0.73      2000



In [186]:
## Spot-check the current classifier on a positive sentence,
## cleaned with the same function as the training text
input_text = 'i love this movie'
clean_input_text = custom_clean_text(x=input_text)
classifier.predict(X=[clean_input_text])

array([1.])

In [187]:
## Spot-check the current classifier on a negative sentence,
## cleaned with the same function as the training text
input_text = 'i hate this movie'
clean_input_text = custom_clean_text(x=input_text)
classifier.predict(X=[clean_input_text])

array([0.])

## TFIDF with ngram

In [188]:
## TF-IDF over unigrams and bigrams: ngram_range=(1, 2)
vectorizer = TfidfVectorizer(ngram_range=(1, 2), lowercase=True)
vectorizer

In [189]:
## Fit on the single sample tweet. With only one document every term gets the
## same IDF, so the printed row is the L2-normalised count vector over the
## ten unigram/bigram features.
processed_tweet = vectorizer.fit_transform([sample_tweet])
print(processed_tweet.toarray(), vectorizer.vocabulary_, sep='\n')

[[0.2773501 0.2773501 0.2773501 0.2773501 0.2773501 0.5547002 0.2773501
  0.2773501 0.2773501 0.2773501]]
{'want': 8, 'huge': 2, 'thanks': 5, 'ff': 0, 'support': 4, 'want huge': 9, 'huge thanks': 3, 'thanks ff': 6, 'ff thanks': 1, 'thanks support': 7}


In [190]:

## Model building: TF-IDF over unigrams + bigrams feeding a random forest.
## The first step is named for what it holds (a TfidfVectorizer, not a CountVectorizer).
## random_state fixes the forest's random sampling so the metrics and spot-check
## predictions are reproducible across kernel restarts.
classifier = Pipeline([
    ('tfidf_vectorizer', TfidfVectorizer(lowercase=True, ngram_range=(1, 2))),
    ('tfidf_model', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42))
])

classifier.fit(X_train, y_train)

In [191]:
## Predict on the validation split and report per-class metrics
y_pred = classifier.predict(X=X_val)
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.71      0.80      0.75      1000
           1       0.77      0.67      0.72      1000

    accuracy                           0.73      2000
   macro avg       0.74      0.73      0.73      2000
weighted avg       0.74      0.73      0.73      2000



In [192]:
## Spot-check the current classifier on a positive sentence,
## cleaned with the same function as the training text
input_text = 'i love this movie'
clean_input_text = custom_clean_text(x=input_text)
classifier.predict(X=[clean_input_text])

array([1.])

In [193]:
## Spot-check the current classifier on a negative sentence,
## cleaned with the same function as the training text
input_text = 'i hate this movie'
clean_input_text = custom_clean_text(x=input_text)
classifier.predict(X=[clean_input_text])

array([0.])

In [195]:
## Save model
## The context manager guarantees the file is flushed and closed even if
## pickling fails; a bare open() would leave the handle to the garbage collector.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [196]:
## Reload the pipeline from disk, closing the file handle when done.
## Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('model.pkl', 'rb') as model_file:
    classifier = pickle.load(model_file)

In [197]:
## Confirm the reloaded classifier still predicts on freshly cleaned text
input_text = 'i hate this movie'
clean_input_text = custom_clean_text(x=input_text)
classifier.predict(X=[clean_input_text])

array([0.])