In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
%matplotlib inline

In [3]:
import glob

path = "../datasets/csv/" # use your path
all_movies = glob.glob(path + "/*.csv")

li = []

for filename in all_movies:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

all_movies_final = pd.concat(li, axis=0, ignore_index=True)
all_movies_final


Unnamed: 0.1,Unnamed: 0,rating,review date,review content
0,0,8.0,8 January 2021,Fern (Frances McDormand) is houseless and livi...
1,1,9.0,24 June 2021,When people put themselves in vulnerable posit...
2,2,6.0,2 August 2021,"""Nomadland"" is a new American movie that premi..."
3,3,7.0,18 February 2021,Nomadland is an exploration of people living i...
4,4,5.0,26 May 2021,"Frances McDormand loses her job in Empire, Nev..."
...,...,...,...,...
49069,504,10.0,5 November 2005,I first saw this movie in the theater when it ...
49070,505,10.0,11 January 2020,I saw this movie on the big screen when it was...
49071,506,10.0,22 June 2019,"Great movie!!!! I love the music, history and ..."
49072,507,10.0,22 November 2005,I am listening to the soundtrack as I pen thes...


Importing Datasets

In [4]:
data_all = pd.DataFrame(all_movies_final)
print(data_all)

data_all.columns = [c.replace(' ', '_') for c in data_all.columns]

data_all.dropna()

data_all_new = data_all[data_all["rating"] != -1]
data_all_new.head()

       Unnamed: 0  ...                                     review content
0               0  ...  Fern (Frances McDormand) is houseless and livi...
1               1  ...  When people put themselves in vulnerable posit...
2               2  ...  "Nomadland" is a new American movie that premi...
3               3  ...  Nomadland is an exploration of people living i...
4               4  ...  Frances McDormand loses her job in Empire, Nev...
...           ...  ...                                                ...
49069         504  ...  I first saw this movie in the theater when it ...
49070         505  ...  I saw this movie on the big screen when it was...
49071         506  ...  Great movie!!!! I love the music, history and ...
49072         507  ...  I am listening to the soundtrack as I pen thes...
49073         508  ...  The title infers a movie about dancing and a n...

[49074 rows x 4 columns]


Unnamed: 0,Unnamed:_0,rating,review_date,review_content
0,0,8.0,8 January 2021,Fern (Frances McDormand) is houseless and livi...
1,1,9.0,24 June 2021,When people put themselves in vulnerable posit...
2,2,6.0,2 August 2021,"""Nomadland"" is a new American movie that premi..."
3,3,7.0,18 February 2021,Nomadland is an exploration of people living i...
4,4,5.0,26 May 2021,"Frances McDormand loses her job in Empire, Nev..."


In [5]:
data_all_new['Sentiment'] = data_all_new['rating'].apply(lambda x: "positive" if x>=7 else("negative" if x<=4.9 else "neutral"))
data_all_new

data_all_new1 = pd.DataFrame({"review_content1":data_all_new["review_content"], "Sentiment1":data_all_new["Sentiment"]})
data_all_new1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,review_content1,Sentiment1
0,Fern (Frances McDormand) is houseless and livi...,positive
1,When people put themselves in vulnerable posit...,positive
2,"""Nomadland"" is a new American movie that premi...",neutral
3,Nomadland is an exploration of people living i...,positive
4,"Frances McDormand loses her job in Empire, Nev...",neutral
...,...,...
49069,I first saw this movie in the theater when it ...,positive
49070,I saw this movie on the big screen when it was...,positive
49071,"Great movie!!!! I love the music, history and ...",positive
49072,I am listening to the soundtrack as I pen thes...,positive


In [6]:
features = data_all_new1.iloc[:,0].values
labels = data_all_new1.iloc[:,1].values

processed_features = []

for sentence in range(0, len(features)):
  processed_feature = re.sub(r'\W', ' ', str(features[sentence])) #Remove all the special characters
  processed_feature = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature) #Remove all single characters
  processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) #Remove single characters from the start
  processed_feature = re.sub(r'\s+', ' ', processed_feature, flags  = re.I) #Substituting multiple spaces with single space
  processed_feature = re.sub(r'^b\s+', '', processed_feature) #Removing prefixed 'b
  processed_feature = processed_feature.lower() #Converting to lower case

  processed_features.append(processed_feature)


In [7]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df = 0.8, stop_words = stopwords.words('english'))
processed_features = vectorizer.fit_transform(processed_features).toarray()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size = 0.2, random_state = 0)

In [9]:
from sklearn.ensemble import RandomForestClassifier

text_classifier = RandomForestClassifier(n_estimators = 200, random_state = 0)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=0)

In [10]:
predictions = text_classifier.predict(X_test)

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

[[ 410    1  900]
 [  66    5  716]
 [  44    2 6718]]
              precision    recall  f1-score   support

    negative       0.79      0.31      0.45      1311
     neutral       0.62      0.01      0.01       787
    positive       0.81      0.99      0.89      6764

    accuracy                           0.80      8862
   macro avg       0.74      0.44      0.45      8862
weighted avg       0.79      0.80      0.75      8862

0.8048973143759873
