### Sentiment Analysis of Movie Reviews Using Logistic Regression

#### The text must first be read into a list of reviews so that each token can later be classified as a word, a digit, or a stopword.

In [1]:
# Read the train and test data into lists, one review per line.

def load_reviews(path):
    """Return every line of the text file at `path`, stripped of surrounding whitespace.

    The file is opened in a `with` block so the handle is closed as soon as
    reading finishes, instead of being left open for the kernel's lifetime.
    """
    with open(path, 'r') as review_file:
        return [line.strip() for line in review_file]


# NOTE: these are absolute, machine-specific paths; adjust them to wherever
# the data files live on your machine.
reviews_for_train = load_reviews('/home/gauri/Downloads/movie_data/full_train.txt')
reviews_for_test = load_reviews('/home/gauri/Downloads/movie_data/full_test.txt')

#### The data is cleaned to remove unwanted symbols: some are deleted outright, while others are replaced by a space so that the surrounding words stay separated.

In [6]:
import re #regular expression

# Punctuation that is deleted outright (nothing is left in its place).
# Raw strings keep the regex escapes from being read as (invalid) Python
# string escapes, which newer interpreters warn about.
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
# A doubled HTML line break, a hyphen, or a slash is replaced by one space.
REPLACE_WITH_SPACE = re.compile(r"<br\s*/><br\s*/>|[-/]")


def preprocess_reviews(reviews):
    """Return a cleaned copy of `reviews`.

    Each review is lower-cased, stripped of the punctuation matched by
    REPLACE_NO_SPACE, and then has every REPLACE_WITH_SPACE match turned
    into a single space.

    Parameters:
        reviews: iterable of review strings.

    Returns:
        A new list of cleaned strings, in the same order as the input.
    """
    return [
        REPLACE_WITH_SPACE.sub(" ", REPLACE_NO_SPACE.sub("", line.lower()))
        for line in reviews
    ]

 #clean the dataset by passing it to preprocess_reviews                                     
# Run both splits through the same cleaning routine.
reviews_train_clean, reviews_test_clean = (
    preprocess_reviews(split) for split in (reviews_for_train, reviews_for_test)
)

#### In order for this data to make sense to our machine learning algorithm we’ll need to convert each review to a numeric representation, which we call vectorization.



#### The simplest form of this is to create one very large matrix with one column for every unique word in your corpus (where the corpus is all 50k reviews in our case). Then we transform each review into one row containing 0s and 1s, where 1 means that the word in the corpus corresponding to that column appears in that review. That being said, each row of the matrix will be very sparse (mostly zeros). This process is also known as one hot encoding.



In [10]:
from sklearn.feature_extraction.text import CountVectorizer


# Learn the vocabulary from the cleaned training reviews only; binary=True
# records whether a word occurs in a review rather than how often.
cv = CountVectorizer(binary=True).fit(reviews_train_clean)

# Encode both splits against that same training vocabulary.
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)

#### Now we classify the data with logistic regression, training the model several times with different values of the regularization parameter C to find the one that gives the best validation accuracy.

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Labels are assigned purely by position: the first 12,500 rows get 1 and the
# remaining 12,500 get 0.
# NOTE(review): assumes the training file lists all positive reviews first,
# followed by all negative ones -- confirm against the data source.
target = [1 if i < 12500 else 0 for i in range(25000)]

# Hold out 25% of the training rows for validation. A fixed random_state makes
# the split, and therefore the accuracies printed below, reproducible across
# kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)

# Sweep the inverse regularization strength C and report validation accuracy
# for each value.
for c in [0.01, 0.05, 0.25, 0.5, 1, 1.25]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.87152
Accuracy for C=0.05: 0.88352
Accuracy for C=0.25: 0.88224
Accuracy for C=0.5: 0.87888
Accuracy for C=1: 0.87616
Accuracy for C=1.25: 0.87568


#### We can see that the validation accuracy is highest when C is 0.05.
#### Hence we will train the final model on the entire training set with that value of C.

In [19]:
# Refit on every training row using the chosen regularization strength.
final_model = LogisticRegression(C=0.05).fit(X, target)

# NOTE(review): `target` (built for the training rows) is reused here as the
# test labels; this presumes the test file has the same size and the same
# label ordering as the training file -- confirm.
final_accuracy = accuracy_score(target, final_model.predict(X_test))
print("Final Accuracy: %s" % final_accuracy)

Final Accuracy: 0.88152


#### As a sanity check, let’s look at the 5 most discriminating words for both positive and negative reviews. We’ll do this by looking at the largest and smallest coefficients, respectively.



In [25]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Map each vocabulary word to its learned coefficient in the final model.
feature_to_coef = dict(zip(feature_names, final_model.coef_[0]))

# The five largest coefficients, i.e. the words pushing hardest toward label 1.
for best_positive in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1],
    reverse=True)[:5]:
    print(best_positive)

('excellent', 0.9292549160835142)
('perfect', 0.7907005816719882)
('great', 0.6745323532968273)
('amazing', 0.6127039979137073)
('superb', 0.6019368030663197)


In [26]:
# Rank (word, coefficient) pairs from most negative upward and show the first
# five, i.e. the words pushing hardest toward label 0.
coef_ascending = sorted(feature_to_coef.items(), key=lambda pair: pair[1])
for best_negative in coef_ascending[:5]:
    print(best_negative)

('worst', -1.3645959137395889)
('waste', -1.1664242167922412)
('awful', -1.0324189587916497)
('poorly', -0.8752018809128563)
('boring', -0.8563543445904963)
