In [2]:
#Import dependencies and Downloading dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sentiment_tokenizer import Tokenizer
import pickle
from sklearn.metrics import precision_score,recall_score,f1_score,confusion_matrix
tok=Tokenizer(preserve_case=False)
import requests,io,os,tarfile
r=requests.get('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')
with open('aclImdb_v1.tar.gz','wb') as f:
  f.write(r.content)
z=tarfile.open('/content/aclImdb_v1.tar.gz','r:gz')
os.mkdir('/content/imdb_movie_review')
z.extractall('/content/imdb_movie_review')

In [3]:
#Split dataset into input and output
def create_input_output(file):
  # dataset=[]
  input_data=[]
  output_data=[]
  for folder in os.listdir(file):
    if(folder=='pos'):
      for data in os.listdir(file+'/'+folder+"/"):
        f=open(file+'/'+folder+'/'+data,'r')
        data=f.read()
        # dataset.append({
        #     folder:data
        # })
        input_data.append(data)
        output_data.append(folder)
    if(folder=='neg'):
      for data in os.listdir(file+'/'+folder+"/"):
        f=open(file+'/'+folder+'/'+data,'r')
        data=f.read()
        # dataset.append({
        #     folder:data
        # })
        input_data.append(data)
        output_data.append(folder)
  return input_data,output_data

In [4]:
def preprocessing(sent):
  train_review=[]
  for review in sent:
    s=''
    words=tok.tokenize(review)
    tokenized_words=[]
    for word in words:
      tokenized_words.append(word)
    j=0
    while(j<len(tokenized_words)):
      k=0
      if(("n't" in tokenized_words[j]) or("not" in tokenized_words[j]) or (tokenized_words[j]=='.')):
        s+=" "+tokenized_words[j]
        index=j+1
        i=index
        while(i<len(tokenized_words)):
          if(tokenized_words[i]!='but' and tokenized_words[i]!='and' and tokenized_words[i]!='.' and tokenized_words[i]!=','):
            s+=" NOT_"+tokenized_words[i]
            i+=1
            k=i
          else:
            s+=" "+tokenized_words[i]
            i+=1
            k=i
            break
      else:
        s+=" "+tokenized_words[j]
      if(k==0):
        j+=1
      else:
        j=k
    train_review.append(s)
  return train_review

In [5]:
def vectorize(input_data):
  vect=TfidfVectorizer()
  vect=vect.fit(input_data)
  input_data=vect.transform(input_data)
  return vect,input_data

In [6]:
def label_encoding(output_data):
  le=LabelEncoder()
  output_data=le.fit_transform(output_data)
  return le,output_data

In [7]:
#SVM
from sklearn import svm
def model_support_vector_machine(input_data,output_data):
  model_svm=svm.SVC(gamma='scale')
  model_svm.fit(input_data,output_data)
  return model_svm

In [8]:
def get_test_data(file):
  test_dataset=[]
  test_input_data=[]
  test_output_data=[]
  for folder in os.listdir(file):
    if(folder=='pos'):
      for data in os.listdir(file+'/'+folder+"/"):
        f=open(file+'/'+folder+'/'+data,'r')
        data=f.read()
        test_dataset.append({
            folder:data
        })
        test_input_data.append(data)
        test_output_data.append(folder)
    if(folder=='neg'):
      for data in os.listdir(file+'/'+folder+"/"):
        f=open(file+'/'+folder+'/'+data,'r')
        data=f.read()
        test_dataset.append({
            folder:data
        })
        test_input_data.append(data)
        test_output_data.append(folder)
  return test_input_data,test_output_data

In [9]:
def evaluating_model(Y_true,Y_pred):
  matrix=confusion_matrix(Y_true,Y_pred)
  precision=precision_score(Y_true,Y_pred)
  recall=recall_score(Y_true,Y_pred)
  f1=f1_score(Y_true,Y_pred)
  return matrix,precision,recall,f1

In [10]:
print("*************Training Time***********************")
input_data,output_data=create_input_output("/content/imdb_movie_review/aclImdb/train")
input_data=preprocessing(input_data)
vect,input_data=vectorize(input_data)
le,output_data=label_encoding(output_data)

print("Training SVM model:\n")
model_svm=model_support_vector_machine(input_data,output_data)
print("SVM model trained successfully\n")
print("Training Accuracy:",model_svm.score(input_data,output_data))

print("*************Testing Time***********************")
test_input_data,test_output_data=get_test_data("/content/imdb_movie_review/aclImdb/test")
test_input_data=preprocessing(test_input_data)
test_input_data=vect.transform(test_input_data)
test_output_data=le.transform(test_output_data)

print("Evaluating SVM model:\n")
print("Testing Accuracy:",model_svm.score(test_input_data,test_output_data))
predicted_value=model_svm.predict(test_input_data)
matrix,precision,recall,f1=evaluating_model(test_output_data,predicted_value)
print("Confusion matrix:- ",matrix)
print("Precision:- ",precision)
print("Recall:- ",recall)
print("f1 score:- ",f1)

*************Training Time***********************
Training SVM model:

SVM model trained successfully

*************Testing Time***********************
Evaluating SVM model:

Accuracy: 0.88636
Confusion matrix:-  [[11104  1396]
 [ 1445 11055]]
Precision:-  0.8878804915267849
Recall:-  0.8844
f1 score:-  0.8861368281832391


In [None]:
with open('model.pkl','wb') as f:
  pickle.dump(model_svm,f)
with open('vect.pkl','wb') as f:
  pickle.dump(vect,f)
with open('le.pkl','wb') as f:
  pickle.dump(le,f)