In [1]:
#tf-idf vectorizer for text classification using sklearn
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Toy corpus for the TF-IDF demo: seven short sentences, five of which follow
# the same "<company> is announcing new <product> tomorrow" template.
data_string = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [3]:
# Create a TF-IDF vectorizer with default settings, fit it on the toy corpus,
# and keep the transformed corpus in `vectorize_data`.
v = TfidfVectorizer()
vectorize_data = v.fit_transform(data_string)


In [4]:
# Print the vocabulary the vectorizer learned: every distinct token it kept.
# With the default settings tokens are lowercased and split on punctuation, so
# "model-3" contributes only "model" and one-letter words such as "I" are dropped.
print(v.get_feature_names_out())


['already' 'am' 'amazon' 'and' 'announcing' 'apple' 'are' 'ate' 'biryani'
 'dot' 'eating' 'eco' 'google' 'grapes' 'iphone' 'ironman' 'is' 'loki'
 'microsoft' 'model' 'new' 'pixel' 'pizza' 'surface' 'tesla' 'thor'
 'tomorrow' 'you']


In [5]:
# List every vocabulary term with its column index and its IDF weight.
# `idf_` holds the inverse document frequency learned during fit -- one value
# per term for the whole corpus -- not a per-document tf-idf score. Terms that
# occur in many sentences ("is", "new") get a lower weight than rare ones.
all_featured_names = v.get_feature_names_out()
# get_feature_names_out() returns the terms ordered by column index, so the
# position in this array is the same index `vocabulary_` stores for the term.
for indx, featurename in enumerate(all_featured_names):
    print(f"{indx} : {featurename} :: {v.idf_[indx]}")
    

0 : already :: 2.386294361119891
1 : am :: 2.386294361119891
2 : amazon :: 2.386294361119891
3 : and :: 2.386294361119891
4 : announcing :: 1.2876820724517808
5 : apple :: 2.386294361119891
6 : are :: 2.386294361119891
7 : ate :: 2.386294361119891
8 : biryani :: 2.386294361119891
9 : dot :: 2.386294361119891
10 : eating :: 1.9808292530117262
11 : eco :: 2.386294361119891
12 : google :: 2.386294361119891
13 : grapes :: 2.386294361119891
14 : iphone :: 2.386294361119891
15 : ironman :: 2.386294361119891
16 : is :: 1.1335313926245225
17 : loki :: 2.386294361119891
18 : microsoft :: 2.386294361119891
19 : model :: 2.386294361119891
20 : new :: 1.2876820724517808
21 : pixel :: 2.386294361119891
22 : pizza :: 2.386294361119891
23 : surface :: 2.386294361119891
24 : tesla :: 2.386294361119891
25 : thor :: 2.386294361119891
26 : tomorrow :: 1.2876820724517808
27 : you :: 2.386294361119891


In [6]:
import pandas as pd 
# Read the IMDB review dataset, report how many reviews carry each sentiment
# label, and preview the first rows.
df = pd.read_csv("IMDB_Dataset.csv")
print(df.value_counts(subset="sentiment"))
df.head()

sentiment
negative    25000
positive    25000
Name: count, dtype: int64


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# Models need numeric targets, so encode the text label as an integer:
# 'positive' -> 1, 'negative' -> 0.
df['sentiment_num'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [8]:
# Preview the frame again to confirm the new sentiment_num column is present.
df.head()

Unnamed: 0,review,sentiment,sentiment_num
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [9]:
#import the helper used to split the data into train and test sets (the split itself is done in a later cell)
from sklearn.model_selection import train_test_split 


NameError: name 'df' is not defined

In [10]:
# Draw an equally sized random sample from each class (min_sample reviews
# apiece); the fixed seed keeps the draw repeatable.
min_sample = 1000
min_pos = df.loc[df['sentiment'] == "positive"].sample(n=min_sample, random_state=42)
min_neg = df.loc[df['sentiment'] == "negative"].sample(n=min_sample, random_state=42)

In [30]:
# Stack the positive and negative samples into one balanced frame.
min_df = pd.concat([min_pos, min_neg], axis=0)
# A bare expression is only displayed when it is the last line of a cell, so
# the shape has to be printed explicitly to appear alongside the preview.
print(min_df.shape)
min_df.head()


Unnamed: 0,review,sentiment,sentiment_num
13886,I don't know how or why this film has a meager...,positive,1
48027,For a long time it seemed like all the good Ca...,positive,1
19536,Terry Gilliam's and David Peoples' teamed up t...,positive,1
27232,What is there to say about an anti-establishme...,positive,1
28001,This movie was made only 48 years after the en...,positive,1


In [12]:
# Check that the sampled frame holds the same number of rows for each label.
min_df['sentiment_num'].value_counts()

sentiment_num
1    1000
0    1000
Name: count, dtype: int64

In [36]:
# Divide the full dataset into training and test sets. Stratifying on the
# label keeps the positive/negative ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment_num'],
    test_size=0.2,      # hold out 20% of the rows for testing
    random_state=42,
    stratify=df['sentiment_num'],
)

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer



In [14]:
# Two-stage text classifier:
#   1. 'tfidf' turns each raw review into a TF-IDF feature vector
#   2. 'model' is a multinomial Naive Bayes classifier trained on those vectors
classifier = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('model', MultinomialNB()),
])

In [15]:
# Fit the whole pipeline on the training split: the vectorizer is fitted first
# and the Naive Bayes model is then trained on its output.
classifier.fit(X_train, y_train)

0,1,2
,steps,"[('tfidf', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [16]:

# Predict labels for the held-out reviews, then report per-class precision,
# recall and F1.
y_predict = classifier.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_predict))


              precision    recall  f1-score   support

           0       0.85      0.89      0.87      5000
           1       0.88      0.84      0.86      5000

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [17]:
# Peek at the first five test reviews.
X_test.head(5)

18870    Yes, MTV there really is a way to market Daria...
39791    The story of the bride fair is an amusing and ...
30381    A team varied between Scully and Mulder, two o...
42294    This was a popular movie probably because of t...
33480    This movie made me so angry!! Here I am thinki...
Name: review, dtype: object

In [20]:
# True labels of the first seven test reviews.
y_test.head(7)

18870    0
39791    0
30381    1
42294    0
33480    0
44918    0
32133    0
Name: sentiment_num, dtype: int64

In [19]:
# First seven predicted labels, for comparison with the true labels of the same rows.
y_predict[:7]

array([0, 1, 1, 0, 0, 0, 0])

In [18]:
# Same two-step pipeline as before, but with a random forest as the classifier.
from sklearn.ensemble import RandomForestClassifier

clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # A random forest is stochastic; pin the seed (42, as used for sampling and
    # splitting in this notebook) so the reported scores can be reproduced.
    ('model', RandomForestClassifier(random_state=42))
])

clf.fit(X_train, y_train)


NameError: name 'X_train' is not defined

In [25]:
# Evaluate the random-forest pipeline on the test reviews.
y_predct = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_predct))

              precision    recall  f1-score   support

           0       0.84      0.85      0.85      5000
           1       0.85      0.83      0.84      5000

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



In [14]:
# Next: clean the text with spaCy before vectorizing, to see how preprocessing
# changes the results.
import spacy 
# Load the "en_core_web_sm" pipeline once; preprocess_text reuses this object on every call.
nlp = spacy.load("en_core_web_sm")
def preprocess_text(text):
    """Return `text` reduced to the lemmas of its informative tokens.

    The text is run through the module-level spaCy pipeline `nlp`. Stop words
    and punctuation tokens are dropped, every remaining token is replaced by
    its lemma, and the lemmas are joined with single spaces.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [33]:
# Clean every sampled review and store the result in a new column next to the raw text.
min_df["preprocessed_review"] = min_df["review"].apply(preprocess_text)

In [34]:
# Compare the raw and the preprocessed text for the first few sampled reviews.
min_df.head()

Unnamed: 0,review,sentiment,sentiment_num,preprocessed_review
13886,I don't know how or why this film has a meager...,positive,1,know film meager rating IMDb film accompany cu...
48027,For a long time it seemed like all the good Ca...,positive,1,long time like good canadian actor head south ...
19536,Terry Gilliam's and David Peoples' teamed up t...,positive,1,Terry Gilliam David Peoples team create intell...
27232,What is there to say about an anti-establishme...,positive,1,anti establishment film produce time colourles...
28001,This movie was made only 48 years after the en...,positive,1,movie 48 year end Civil War likely anticipatio...


In [37]:
# Split the preprocessed sample into training and test sets, stratified on the
# label. NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the
# split of the full dataset made earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    min_df['preprocessed_review'],
    min_df['sentiment_num'],
    test_size=0.2,      # hold out 20% of the rows for testing
    random_state=42,
    stratify=min_df['sentiment_num'],
)

In [38]:
# Random-forest pipeline for the preprocessed reviews.
from sklearn.ensemble import RandomForestClassifier
clf_preprocess = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # A random forest is stochastic; pin the seed (42, as used for sampling and
    # splitting in this notebook) so the reported scores can be reproduced.
    ('model', RandomForestClassifier(random_state=42))
])

In [39]:
# Train the pipeline on the current training split (the preprocessed sample
# once the split of min_df has been run).
clf_preprocess.fit(X_train, y_train)

0,1,2
,steps,"[('tfidf', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [41]:
# Evaluate the pipeline trained on preprocessed text against its test split.
y_predict_preprocess = clf_preprocess.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_predict_preprocess))

              precision    recall  f1-score   support

           0       0.75      0.77      0.76       200
           1       0.77      0.75      0.76       200

    accuracy                           0.76       400
   macro avg       0.76      0.76      0.76       400
weighted avg       0.76      0.76      0.76       400

