In [1]:
# PART - 1 TRAINING SVC AND NBC MODEL ON THE FILM Review DATASET
# PART - 2 UMAP
# PART - 3 DEPLOYMENT OF BETTER MODEL (NAIVE BAYES IN OUR MOVIE REVIEWS DATASET)
# LABELLED DATASET:     1025 ROWS
# UNLABELLED DATASET:   40 ROWS

# Importing relevant libraries for the SVC and NBC and text classification
import re, nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
import joblib

# Display configuration: show every column and the full text of each cell when printing
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Load the labelled reviews and encode the sentiment label as an integer
# (Positive -> 1, Negative -> 0; any other value becomes NaN through Series.map).
# No column is dropped here; the code below only reads 'Review' and 'Label'.
df = pd.read_csv("/content/Labelled.csv")
df['Label'] = df.Label.map({'Positive': 1, 'Negative': 0})

# Cleaning Review
def cleaner(Review):
    """Convert one raw review into a list of lower-cased, lemmatised tokens.

    Processing order:
      1. parse the text with BeautifulSoup ('lxml' parser) and keep only the text content;
      2. blank out every run that starts with '#', '@', 'http://', 'https://' or 'www'
         up to the next whitespace;
      3. replace every run of non-alphabetic characters with a single space;
      4. tokenise with nltk, lower-case, drop English stop words, lemmatise with WordNet.

    Parameters
    ----------
    Review : str
        Raw review text. Any non-string value (for example the NaN that pandas
        produces for an empty CSV cell) yields an empty list instead of raising.

    Returns
    -------
    list of str
        The cleaned tokens; may be empty.
    """
    if not isinstance(Review, str):
        # BeautifulSoup cannot parse non-string input. Returning no tokens lets the
        # "length > 0" filter applied right after cleaning drop the row.
        return []

    # 'lxml' is the HTML parser backend and must be installed ('pip install lxml')
    souped = BeautifulSoup(Review, 'lxml').get_text()
    re1 = re.sub(r"(#|@|http://|https://|www)\S*", " ", souped)  # hashtags, @mentions, urls -> whitespace
    re2 = re.sub("[^A-Za-z]+", " ", re1)  # any run of non-alphabetic characters -> single space

    lower_case = [t.lower() for t in nltk.word_tokenize(re2)]

    stop_words = set(stopwords.words('english'))
    filtered_result = [t for t in lower_case if t not in stop_words]

    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]

# Tokenise every review, then keep only the rows that still have at least one token.
df['cleaned_Review'] = df.Review.apply(cleaner)
# .copy() makes the filtered frame independent of the original one, so the column
# assignment further down does not raise pandas' SettingWithCopyWarning.
df = df[df['cleaned_Review'].map(len) > 0].copy()
print("Printing top 5 rows of dataframe showing original and cleaned summaries....")
print(df[['Review','cleaned_Review']].head())
# TfidfVectorizer expects one string per document, so the token lists are joined back with spaces
df['cleaned_Review'] = df['cleaned_Review'].map(" ".join)
data = df['cleaned_Review']
Y = df['Label'] # Assigning Target column

# Creation of the TF-IDF matrix over unigrams, bigrams and trigrams (ngram_range=(1, 3)).
# A float min_df is a proportion of documents: an n-gram is kept only if it occurs in at
# least 0.4% of the reviews (just over 4 documents for the 1025 labelled rows).
# NOTE(review): the vectoriser is fitted on the complete labelled set before the
# cross-validation below splits it, so vocabulary and IDF weights also see the
# validation-fold documents.
tfidf = TfidfVectorizer(min_df=0.004, ngram_range=(1,3))
data_tfidf = tfidf.fit_transform(data) # learn vocabulary + IDF and create the tfidf values in one pass
vocab_terms = tfidf.get_feature_names_out()
# Persist the vocabulary (one term per line, no header, no index) for the deployment step
pd.Series(vocab_terms).to_csv('vocabulary_film_Review.csv', header=False, index=False)
print("The created tokens: \n", vocab_terms)
print("Shape of tfidf matrix: ", data_tfidf.shape)

# Implementing Support Vector Classifier
print("Implementing SVC.....")
model1 = LinearSVC() # linear SVM with default settings (C = 1)

# Running 10-fold stratified cross-validation.
# SMOTE is applied inside the loop, to the training fold only, so the held-out fold
# is always scored on un-resampled data.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
scores = []
smote = SMOTE(random_state = 101)
for iteration, (train_index, test_index) in enumerate(kf.split(data_tfidf, Y), start=1):
    print("Iteration ", iteration)
    # kf.split yields POSITIONAL indices. Y keeps the dataframe's original index labels,
    # which have gaps as soon as an empty review was dropped, so it must be indexed with
    # .iloc (plain Y[...] would do a label lookup and fail or pick the wrong rows).
    X_train, Y_train = data_tfidf[train_index], Y.iloc[train_index]
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    X_train, Y_train = smote.fit_resample(X_train, Y_train) # Balancing training data
    model1.fit(X_train, Y_train) # Fitting LinearSVC
    Y_pred = model1.predict(X_test)
    score = metrics.precision_score(Y_test, Y_pred) # Calculating precision
    print("Cross-validation precison: ", score)
    scores.append(score) # appending cross-validation precision for each iteration
mean_precision = np.mean(scores)
print("Mean cross-validation precision: ", mean_precision)


# Naive Bayes classifier, evaluated with the same 10-fold stratified protocol
print("Implementing NBC.....")
nbc_clf = MultinomialNB()

# Fresh splitter and oversampler with fixed seeds
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
smote = SMOTE(random_state = 101)
scores = []
for iteration, (train_index, test_index) in enumerate(kf.split(data_tfidf, Y), start=1):
    print("Iteration ", iteration)
    # Held-out fold stays untouched; only the training fold is oversampled
    X_test, Y_test = data_tfidf[test_index], Y.iloc[test_index]
    X_train, Y_train = smote.fit_resample(data_tfidf[train_index], Y.iloc[train_index])
    # fit() returns the estimator itself, so fitting and predicting can be chained
    Y_pred = nbc_clf.fit(X_train, Y_train).predict(X_test)
    score = metrics.precision_score(Y_test, Y_pred)
    print("Cross-validation precision: ", score)
    scores.append(score)
# Average precision over the ten folds
nbc_mean_precision = np.mean(scores)
print("Mean cross-validation precision: ", nbc_mean_precision)
# Final training set: the complete labelled data, oversampled with SMOTE
data_tfidf, Y = smote.fit_resample(data_tfidf, Y)

# Model selection: SVC is kept only when its mean cross-validation precision is
# strictly higher than NBC's; in every other case (including a tie) NBC is kept.
if mean_precision > nbc_mean_precision:
    print("Saving SVC:")
    clf = LinearSVC()
else:
    print("Ssaving NBC")
    clf = MultinomialNB()

# Refit the chosen estimator on the resampled data and persist it to the .sav file
clf.fit(data_tfidf, Y)
joblib.dump(clf, 'Film_Reviews_model.sav')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


Printing top 5 rows of dataframe showing original and cleaned summaries....
                                                       Review  \
0           "This movie was fantastic, I loved every minute!"   
1          "The acting in this movie was terrible, avoid it."   
2              "I was on the edge of my seat the whole time!"   
3  "I couldn't finish watching this movie, it was so boring."   
4        "The cinematography in this movie was breathtaking."   

                             cleaned_Review  
0  [movie, fantastic, loved, every, minute]  
1          [acting, movie, terrible, avoid]  
2                 [edge, seat, whole, time]  
3         [finish, watching, movie, boring]  
4     [cinematography, movie, breathtaking]  
The created tokens: 
 ['absolutely' 'acting' 'acting forgettable' 'acting mediocre'
 'acting movie' 'acting phenomenal' 'acting superb' 'acting terrible'
 'acting top' 'acting top notch' 'acting wooden' 'acting wooden plot'
 'action' 'action adventure' 'act

In [None]:
pip install umap-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
#UMAP Dimensionality Reduction and Visualisation
# PART - 2 UMAP ON THE FILM REVIEWS DATASET
import re, nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import umap
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Printing options: no truncation of cell contents, no hidden columns
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

# Labelled reviews; sentiment encoded as 1 (Positive) / 0 (Negative)
sentiment_codes = {'Positive':1, 'Negative':0}
df = pd.read_csv("/content/Labelled.csv")
df['Label'] = df.Label.map(sentiment_codes)

# Cleaning Reviews
def cleaner(review):
    """Convert one raw review into a list of lower-cased, lemmatised tokens.

    The text is parsed with BeautifulSoup ('lxml' parser) and reduced to its text
    content; runs starting with '@', 'http://', 'https://', 'www' or a literal '\\x'
    are blanked up to the next whitespace; every run of non-alphabetic characters
    becomes one space; the result is tokenised with nltk, lower-cased, stripped of
    English stop words and lemmatised with WordNet.

    Parameters
    ----------
    review : str
        Raw review text. Any non-string value (for example the NaN that pandas
        produces for an empty CSV cell) yields an empty list instead of raising.

    Returns
    -------
    list of str
        The cleaned tokens; may be empty.
    """
    if not isinstance(review, str):
        # BeautifulSoup cannot parse non-string input. Returning no tokens lets the
        # "length > 0" filter applied right after cleaning drop the row.
        return []

    # 'lxml' is the HTML parser backend and must be installed ('pip install lxml')
    souped = BeautifulSoup(review, 'lxml').get_text()
    # For more info on regular expressions visit https://docs.python.org/3/howto/regex.html
    re1 = re.sub(r"(@|http://|https://|www|\\x)\S*", " ", souped)  # @mentions, urls, etc -> whitespace
    re2 = re.sub("[^A-Za-z]+", " ", re1)  # any run of non-alphabetic characters -> single space

    lower_case = [t.lower() for t in nltk.word_tokenize(re2)]

    stop_words = set(stopwords.words('english'))
    filtered_result = [t for t in lower_case if t not in stop_words]

    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]

# Tokenise every review, then keep only the rows that still have at least one token.
df['cleaned_review'] = df.Review.apply(cleaner)
# .copy() makes the filtered frame independent of the original one, so the column
# assignment further down does not raise pandas' SettingWithCopyWarning.
df = df[df['cleaned_review'].map(len) > 0].copy()
print("Printing top 5 rows of dataframe showing original and cleaned summaries....")
print(df[['Review','cleaned_review']].head())
# TfidfVectorizer expects one string per document, so the token lists are joined back with spaces
df['cleaned_review'] = df['cleaned_review'].map(" ".join)
data = df['cleaned_review']
Y = df['Label'] # target column
# A float min_df is a proportion of documents: each n-gram (unigram, bigram or trigram)
# must occur in at least 0.4% of the reviews to become a feature.
tfidf = TfidfVectorizer(min_df=.004, ngram_range=(1,3))
data_tfidf = tfidf.fit_transform(data) # learn vocabulary + IDF and create the tfidf values in one pass
print("Shape of tfidf matrix: ", data_tfidf.shape)

# Implementing UMAP to visualize dataset: project the TF-IDF rows to two dimensions.
# n_neighbors=50 and min_dist=0.5 are the author's chosen settings (reported to reveal
# sub-clusters of similar reviews). UMAP is stochastic, so random_state is fixed to make
# the embedding - and therefore the plot - identical on every run.
u = umap.UMAP(n_neighbors=50, min_dist=0.5, random_state=42)
x_umap = u.fit_transform(data_tfidf)

Label = list(df['Label'])
Review = list(df['Review'])

# One hover tooltip per point: the numeric label plus the original review text
hover_text = [f'Label: {a}<br>Review: {b}' for a, b in zip(Label, Review)]

data_ = [go.Scatter(x=x_umap[:,0], y=x_umap[:,1], mode='markers',
                    marker = dict(color=df['Label'], colorscale='Rainbow', opacity=0.5),
                    text=hover_text,
                    hoverinfo='text')]

layout = go.Layout(title = 'UMAP Dimensionality Reduction', width = 900, height = 900,
                    xaxis = dict(title='First Dimension'),
                    yaxis = dict(title='Second Dimension'))
fig = go.Figure(data=data_, layout=layout)
fig.show()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Printing top 5 rows of dataframe showing original and cleaned summaries....
                                                       Review  \
0           "This movie was fantastic, I loved every minute!"   
1          "The acting in this movie was terrible, avoid it."   
2              "I was on the edge of my seat the whole time!"   
3  "I couldn't finish watching this movie, it was so boring."   
4        "The cinematography in this movie was breathtaking."   

                             cleaned_review  
0  [movie, fantastic, loved, every, minute]  
1          [acting, movie, terrible, avoid]  
2                 [edge, seat, whole, time]  
3         [finish, watching, movie, boring]  
4     [cinematography, movie, breathtaking]  
Shape of tfidf matrix:  (1025, 578)


In [None]:
# PART- 3
# MODEL DEPLOYMENT using the vocabulary file, sav model file, and Unlabelled csv file
import re, nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# joblib.load unpickles the file, which can execute arbitrary code - only load a
# model file produced by this notebook / a trusted source.
model = joblib.load('/content/Film_Reviews_model.sav')

# One vocabulary term per line. keep_default_na=False / dtype=str stop pandas from
# turning a term such as "nan" or "null" into a missing value.
vocabulary = pd.read_csv('/content/vocabulary_film_Review.csv', header=None,
                         dtype=str, keep_default_na=False)
# Map each term to its column position, in file order
vocabulary_dict = {word: i for i, word in enumerate(vocabulary[0])}
print(vocabulary_dict)

# ngram_range must match the (1, 3) used when the vocabulary was built: with the default
# (1, 1) the vectoriser only emits unigrams, so every bigram/trigram column of the
# vocabulary would stay zero at prediction time.
tfidf = TfidfVectorizer(vocabulary = vocabulary_dict, lowercase=False, ngram_range=(1,3))

# Printing options: show all columns and the complete content of every cell
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# New, unlabelled reviews to be scored by the saved model
df = pd.read_csv("/content/Unlabelled.csv")

# Cleaning Reviews
def cleaner(Reviews):
    """Convert one raw review into a list of lower-cased, lemmatised tokens.

    The text is parsed with BeautifulSoup ('lxml' parser) and reduced to its text
    content; runs starting with '#', '@', 'http://', 'https://', 'www' or a literal
    '\\x' are blanked up to the next whitespace; every run of non-alphabetic characters
    becomes one space; the result is tokenised with nltk, lower-cased, stripped of
    English stop words and lemmatised with WordNet.

    Parameters
    ----------
    Reviews : str
        Raw review text. Any non-string value (for example the NaN that pandas
        produces for an empty CSV cell) yields an empty list instead of raising.

    Returns
    -------
    list of str
        The cleaned tokens; may be empty.
    """
    if not isinstance(Reviews, str):
        # BeautifulSoup cannot parse non-string input. Returning no tokens lets the
        # "length > 0" filter applied right after cleaning drop the row.
        return []

    # 'lxml' is the HTML parser backend and must be installed ('pip install lxml')
    souped = BeautifulSoup(Reviews, 'lxml').get_text()
    # '#' is included so hashtags are removed here exactly as they were when the
    # training reviews were cleaned; otherwise the model would see tokens at
    # prediction time that its training preprocessing discarded.
    re1 = re.sub(r"(#|@|http://|https://|www|\\x)\S*", " ", souped)  # hashtags, @mentions, urls, etc -> whitespace
    re2 = re.sub("[^A-Za-z]+", " ", re1)  # any run of non-alphabetic characters -> single space

    lower_case = [t.lower() for t in nltk.word_tokenize(re2)]

    stop_words = set(stopwords.words('english'))
    filtered_result = [t for t in lower_case if t not in stop_words]

    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]

# Tokenise every review, then keep only the rows that still have at least one token.
df['cleaned_Reviews'] = df.Reviews.apply(cleaner)
# .copy() makes the filtered frame independent of the original one, so the column
# assignments further down do not raise pandas' SettingWithCopyWarning.
df = df[df['cleaned_Reviews'].map(len) > 0].copy()
print("Printing top 5 rows of dataframe showing original and cleaned tweets....")
print(df[['Reviews','cleaned_Reviews']].head())
# TfidfVectorizer expects one string per document, so the token lists are joined back with spaces
df['cleaned_Reviews'] = df['cleaned_Reviews'].map(" ".join)
data = df['cleaned_Reviews']
# NOTE(review): the vocabulary is fixed, but fitting here recomputes the IDF weights from
# the unlabelled reviews, so they differ from the weights the model was trained with.
# TODO: persist the fitted training vectoriser and only call transform() here.
data_tfidf = tfidf.fit_transform(data)
y_pred = model.predict(data_tfidf)
#### Saving predicted ratings to csv
# predict() already returns one value per row, so it can be assigned as a column directly
df['predicted_rating'] = y_pred
df.to_csv('predicted_rating.csv', index=False)


{'absolutely': 0, 'acting': 1, 'acting forgettable': 2, 'acting mediocre': 3, 'acting movie': 4, 'acting phenomenal': 5, 'acting superb': 6, 'acting terrible': 7, 'acting top': 8, 'acting top notch': 9, 'acting wooden': 10, 'acting wooden plot': 11, 'action': 12, 'action adventure': 13, 'action packed': 14, 'action packed adventure': 15, 'action scene': 16, 'action sequence': 17, 'actor': 18, 'added': 19, 'adventure': 20, 'amazing': 21, 'animation': 22, 'anime': 23, 'art': 24, 'attention': 25, 'avoid': 26, 'avoid cost': 27, 'away': 28, 'beautiful': 29, 'beginning': 30, 'beginning end': 31, 'best': 32, 'bland': 33, 'blown': 34, 'blown away': 35, 'bored': 36, 'boring': 37, 'breathtaking': 38, 'breathtaking story': 39, 'brought': 40, 'captivating': 41, 'catchy': 42, 'catchy choreography': 43, 'catchy choreography impressive': 44, 'character': 45, 'character bland': 46, 'character complex': 47, 'character complex story': 48, 'character development': 49, 'character forgettable': 50, 'charac

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
