In [2]:
import numpy as np
import pandas as pd
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
# Read the corpora: one review per line, surrounding whitespace stripped.
# The `with` blocks guarantee both file handles are closed once reading is
# done (a bare open() in a for-loop leaves them open until garbage collection).
with open('train.txt', 'r') as train_file:
    train = [line.strip() for line in train_file]

with open('test.txt', 'r') as test_file:
    test = [line.strip() for line in test_file]

# Labels: positions 0..12499 are labelled 1, positions 12500..24999 are labelled 0.
# NOTE(review): this presumes each file holds exactly 25000 reviews ordered as
# 12500 positive followed by 12500 negative -- confirm against the data files.
target = [1 if i < 12500 else 0 for i in range(25000)]

In [4]:
train[5]

"This isn't the comedic Robin Williams, nor is it the quirky/insane Robin Williams of recent thriller fame. This is a hybrid of the classic drama without over-dramatization, mixed with Robin's new love of the thriller. But this isn't a thriller, per se. This is more a mystery/suspense vehicle through which Williams attempts to locate a sick boy and his keeper.<br /><br />Also starring Sandra Oh and Rory Culkin, this Suspense Drama plays pretty much like a news report, until William's character gets close to achieving his goal.<br /><br />I must say that I was highly entertained, though this movie fails to teach, guide, inspect, or amuse. It felt more like I was watching a guy (Williams), as he was actually performing the actions, from a third person perspective. In other words, it felt real, and I was able to subscribe to the premise of the story.<br /><br />All in all, it's worth a watch, though it's definitely not Friday/Saturday night fare.<br /><br />It rates a 7.7/10 from...<br />

In [5]:
import re

# Raw-string patterns: the escapes (\d, \s, \[ ...) are meant for the regex
# engine, not for Python's string-literal parser.

# Deleted outright: common punctuation characters and runs of digits.
REPLACE_NO_SPACE = re.compile(r'[.;:!?,"()\[\]]|\d+')
# Replaced by one space: any run of <br>/<br /> tags, hyphens and slashes.
# The tag alternative is listed first so a tag is consumed whole before the
# bare "/" alternative can split it into "<br  >".
REPLACE_WITH_SPACE = re.compile(r"(?:<br\s*/?>)+|-|/")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    """Lower-case and strip punctuation/markup from each review.

    Two passes are applied to every review:
      1. lower-case it and delete punctuation and digits (REPLACE_NO_SPACE);
      2. turn HTML line-break tags, hyphens and slashes into a single space
         (REPLACE_WITH_SPACE).

    Args:
        reviews: iterable of review strings.

    Returns:
        A new list of cleaned strings, in the same order as the input.
    """
    cleaned = []
    for line in reviews:
        without_punctuation = REPLACE_NO_SPACE.sub(NO_SPACE, line.lower())
        cleaned.append(REPLACE_WITH_SPACE.sub(SPACE, without_punctuation))
    return cleaned

# Run both corpora through the same cleaning routine (train first, then test).
train_clean, test_clean = preprocess_reviews(train), preprocess_reviews(test)

In [6]:
train_clean[5]

"this isn't the comedic robin williams nor is it the quirky insane robin williams of recent thriller fame this is a hybrid of the classic drama without over dramatization mixed with robin's new love of the thriller but this isn't a thriller per se this is more a mystery suspense vehicle through which williams attempts to locate a sick boy and his keeper also starring sandra oh and rory culkin this suspense drama plays pretty much like a news report until william's character gets close to achieving his goal i must say that i was highly entertained though this movie fails to teach guide inspect or amuse it felt more like i was watching a guy williams as he was actually performing the actions from a third person perspective in other words it felt real and i was able to subscribe to the premise of the story all in all it's worth a watch though it's definitely not friday saturday night fare it rates a   from the fiend "

# Базовая Векторизация

In [8]:
# Baseline bag-of-words features: binary=True records only whether a token
# occurs in a review, not how many times.
base_vectorizer = CountVectorizer(binary=True)
base_vectorizer.fit(train_clean)
X_base = base_vectorizer.transform(train_clean)
X_test_base = base_vectorizer.transform(test_clean)

# Hold out 25% of the training reviews for validation. random_state pins the
# shuffle so the same split is produced on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X_base, target, train_size=0.75, random_state=42
)

# Sweep the regularisation parameter C and report validation accuracy for each.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))

# The final model is built once, after the sweep (not inside the loop body),
# and refit on the full training matrix.
# NOTE(review): C=0.05 is hard-coded -- confirm it still matches the best
# validation score printed above.
final_model = LogisticRegression(C=0.05)
final_model.fit(X_base, target)
# `target` is reused as the test labels.
# NOTE(review): presumes the test file has the same size and label ordering as
# the training file -- confirm.
print("Final Accuracy: %s"
      % accuracy_score(target, final_model.predict(X_test_base)))



Accuracy for C=0.01: 0.87216
Accuracy for C=0.05: 0.88288
Accuracy for C=0.25: 0.87888
Accuracy for C=0.5: 0.87488
Accuracy for C=1: 0.87328
Final Accuracy: 0.88168


# Удаление Бессмысленных Слов 

In [9]:
from nltk.corpus import stopwords

# English stop-word list taken from NLTK's `stopwords` corpus; delete_stop_words
# reads this module-level name when filtering tokens.
# NOTE(review): presumably the corpus must already be downloaded
# (nltk.download('stopwords')) -- confirm for a fresh environment.
stop_words = stopwords.words('english')
def delete_stop_words(corpus, words_to_remove=None):
    """Remove stop words from every document in ``corpus``.

    Each document is split on whitespace, tokens found in the stop-word
    collection are dropped, and the survivors are re-joined with single spaces.

    Args:
        corpus: iterable of document strings.
        words_to_remove: optional iterable of tokens to drop. When omitted,
            the module-level ``stop_words`` is used.

    Returns:
        A new list with one filtered string per input document.
    """
    if words_to_remove is None:
        words_to_remove = stop_words
    # A set makes each membership test O(1) instead of scanning the whole
    # list for every token of every document.
    blocked = frozenset(words_to_remove)
    return [
        ' '.join(word for word in comment.split() if word not in blocked)
        for comment in corpus
    ]

# Strip stop words from both cleaned corpora.
no_stop_words_train = delete_stop_words(train_clean)
no_stop_words_test = delete_stop_words(test_clean)

# Binary bag-of-words over the stop-word-free text.
# NOTE: `cv`, `X` and `X_test` are generic names that later cells rebind.
cv = CountVectorizer(binary=True)
cv.fit(no_stop_words_train)
X = cv.transform(no_stop_words_train)
X_test = cv.transform(no_stop_words_test)

# 75/25 train/validation split; random_state pins the shuffle so the same
# split is produced on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)

# Sweep the regularisation parameter C and report validation accuracy for each.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.876
Accuracy for C=0.05: 0.8816
Accuracy for C=0.25: 0.876
Accuracy for C=0.5: 0.8728
Accuracy for C=1: 0.87008


In [10]:
no_stop_words_train[5]

"comedic robin williams quirky insane robin williams recent thriller fame hybrid classic drama without dramatization mixed robin's new love thriller thriller per se mystery suspense vehicle williams attempts locate sick boy keeper also starring sandra oh rory culkin suspense drama plays pretty much like news report william's character gets close achieving goal must say highly entertained though movie fails teach guide inspect amuse felt like watching guy williams actually performing actions third person perspective words felt real able subscribe premise story worth watch though definitely friday saturday night fare rates fiend"

# Стемминг 

In [11]:
def get_stemmed_text(corpus, stemmer=None):
    """Stem every whitespace-separated token of every review.

    Args:
        corpus: iterable of review strings.
        stemmer: optional object exposing ``stem(word) -> str``. When omitted,
            an NLTK ``PorterStemmer`` is created.

    Returns:
        A new list with one string per review, its stemmed tokens joined by
        single spaces.
    """
    if stemmer is None:
        # Imported lazily: NLTK is only needed when the default stemmer is used.
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()

    return [
        ' '.join(stemmer.stem(word) for word in review.split())
        for review in corpus
    ]

# Stem both cleaned corpora.
stemmed_train = get_stemmed_text(train_clean)
stemmed_test = get_stemmed_text(test_clean)

# Binary bag-of-words over the stemmed text.
# NOTE: `cv`, `X` and `X_test` are generic names that other cells also rebind.
cv = CountVectorizer(binary=True)
cv.fit(stemmed_train)
X = cv.transform(stemmed_train)
X_test = cv.transform(stemmed_test)

# 75/25 train/validation split; random_state pins the shuffle so the same
# split is produced on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)

# Sweep the regularisation parameter C and report validation accuracy for each.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))

# Refit on the full training matrix and score against the test matrix.
# NOTE(review): C=0.05 is hard-coded -- confirm it still matches the best
# validation score printed above.
final_stemmed = LogisticRegression(C=0.05)
final_stemmed.fit(X, target)
# `target` is reused as the test labels.
# NOTE(review): presumes the test file mirrors the training file's size and
# label ordering -- confirm.
print("Final Accuracy: %s"
      % accuracy_score(target, final_stemmed.predict(X_test)))

Accuracy for C=0.01: 0.86384
Accuracy for C=0.05: 0.87552
Accuracy for C=0.25: 0.87184
Accuracy for C=0.5: 0.86672
Accuracy for C=1: 0.86288
Final Accuracy: 0.87748


In [12]:
stemmed_train[5]

"thi isn't the comed robin william nor is it the quirki insan robin william of recent thriller fame thi is a hybrid of the classic drama without over dramat mix with robin' new love of the thriller but thi isn't a thriller per se thi is more a mysteri suspens vehicl through which william attempt to locat a sick boy and hi keeper also star sandra oh and rori culkin thi suspens drama play pretti much like a news report until william' charact get close to achiev hi goal i must say that i wa highli entertain though thi movi fail to teach guid inspect or amus it felt more like i wa watch a guy william as he wa actual perform the action from a third person perspect in other word it felt real and i wa abl to subscrib to the premis of the stori all in all it' worth a watch though it' definit not friday saturday night fare it rate a from the fiend"

# Лемматизация

In [13]:
def get_lemmatized_text(corpus, lemmatizer=None):
    """Lemmatize every whitespace-separated token of every review.

    Args:
        corpus: iterable of review strings.
        lemmatizer: optional object exposing ``lemmatize(word) -> str``. When
            omitted, an NLTK ``WordNetLemmatizer`` is created.

    Returns:
        A new list with one string per review, its lemmatized tokens joined by
        single spaces.

    NOTE(review): ``lemmatize`` is called without a part-of-speech argument.
    The sample output in this notebook shows artifacts such as "was" -> "wa"
    and "as" -> "a"; consider passing POS tags if those matter.
    """
    if lemmatizer is None:
        # Imported lazily: NLTK is only needed when the default lemmatizer is used.
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()

    return [
        ' '.join(lemmatizer.lemmatize(word) for word in review.split())
        for review in corpus
    ]

# Lemmatize both cleaned corpora.
lemmatized_train = get_lemmatized_text(train_clean)
lemmatized_test = get_lemmatized_text(test_clean)

# Binary bag-of-words over the lemmatized text.
# NOTE: `cv`, `X` and `X_test` are generic names that other cells also rebind.
cv = CountVectorizer(binary=True)
cv.fit(lemmatized_train)
X = cv.transform(lemmatized_train)
X_test = cv.transform(lemmatized_test)

# 75/25 train/validation split; random_state pins the shuffle so the same
# split is produced on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)

# Sweep the regularisation parameter C and report validation accuracy for each.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))

# Refit on the full training matrix and score against the test matrix.
# NOTE(review): C=0.25 is hard-coded, while the saved output of this notebook
# shows C=0.05 scoring highest on validation -- confirm which value is intended.
final_lemmatized = LogisticRegression(C=0.25)
final_lemmatized.fit(X, target)
# `target` is reused as the test labels.
# NOTE(review): presumes the test file mirrors the training file's size and
# label ordering -- confirm.
print("Final Accuracy: %s"
      % accuracy_score(target, final_lemmatized.predict(X_test)))

Accuracy for C=0.01: 0.86688
Accuracy for C=0.05: 0.88336
Accuracy for C=0.25: 0.88144
Accuracy for C=0.5: 0.87888
Accuracy for C=1: 0.87648
Final Accuracy: 0.87444


In [14]:
lemmatized_train[5]

"this isn't the comedic robin williams nor is it the quirky insane robin williams of recent thriller fame this is a hybrid of the classic drama without over dramatization mixed with robin's new love of the thriller but this isn't a thriller per se this is more a mystery suspense vehicle through which williams attempt to locate a sick boy and his keeper also starring sandra oh and rory culkin this suspense drama play pretty much like a news report until william's character get close to achieving his goal i must say that i wa highly entertained though this movie fails to teach guide inspect or amuse it felt more like i wa watching a guy williams a he wa actually performing the action from a third person perspective in other word it felt real and i wa able to subscribe to the premise of the story all in all it's worth a watch though it's definitely not friday saturday night fare it rate a from the fiend"

# n-грамма

In [15]:
# Binary features over unigrams and bigrams (ngram_range=(1, 2)).
# NOTE: `X` and `X_test` are generic names that other cells also rebind.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(train_clean)
X = ngram_vectorizer.transform(train_clean)
X_test = ngram_vectorizer.transform(test_clean)

# 75/25 train/validation split; random_state pins the shuffle so the same
# split is produced on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)

# Sweep the regularisation parameter C and report validation accuracy for each.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))

# The final model is built once, after the sweep (not inside the loop body),
# and refit on the full training matrix.
# NOTE(review): C=0.5 is hard-coded -- confirm it still matches the best
# validation score printed above.
final_ngram = LogisticRegression(C=0.5)
final_ngram.fit(X, target)
# `target` is reused as the test labels.
# NOTE(review): presumes the test file mirrors the training file's size and
# label ordering -- confirm.
print("Final Accuracy: %s"
      % accuracy_score(target, final_ngram.predict(X_test)))

Accuracy for C=0.01: 0.89024
Accuracy for C=0.05: 0.89936
Accuracy for C=0.25: 0.90096
Accuracy for C=0.5: 0.90128
Accuracy for C=1: 0.9008
Final Accuracy: 0.898


# Метод опорных векторов (SVM)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Same unigram+bigram binary features as the logistic-regression run, this
# time fed to a linear SVM.
# NOTE: `ngram_vector`, `X` and `X_test` are rebound again by the final cell.
ngram_vector = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vector.fit(train_clean)
X = ngram_vector.transform(train_clean)
X_test = ngram_vector.transform(test_clean)

# 75/25 train/validation split; random_state pins the shuffle so the same
# split is produced on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)

# Sweep the regularisation parameter C and report validation accuracy for each.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, svm.predict(X_val))))

# Refit on the full training matrix and score against the test matrix.
# NOTE(review): C=0.01 is hard-coded -- confirm it still matches the best
# validation score printed above.
final_svm_ngram = LinearSVC(C=0.01)
final_svm_ngram.fit(X, target)
# `target` is reused as the test labels.
# NOTE(review): presumes the test file mirrors the training file's size and
# label ordering -- confirm.
print("Final Accuracy: %s"
      % accuracy_score(target, final_svm_ngram.predict(X_test)))

Accuracy for C=0.01: 0.89392
Accuracy for C=0.05: 0.8928




Accuracy for C=0.25: 0.89056
Accuracy for C=0.5: 0.89072
Accuracy for C=1: 0.89056
Final Accuracy: 0.8974


# Финальная Версия

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC


# Small hand-picked stop-word list handed to the vectorizer.
# NOTE: this rebinds the module-level `stop_words` that was earlier loaded from
# NLTK; delete_stop_words reads that global, so calling it after this cell has
# run filters with this five-word list instead.
stop_words = ['in', 'of', 'at', 'a', 'the']
# Binary features over unigrams, bigrams and trigrams (ngram_range=(1, 3)).
ngram_vector = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words)
ngram_vector.fit(train_clean)
X = ngram_vector.transform(train_clean)
X_test = ngram_vector.transform(test_clean)

# 75/25 train/validation split; random_state pins the shuffle so the same
# split is produced on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75, random_state=42
)

# Sweep a smaller range of C than the earlier cells and report validation accuracy.
for c in [0.001, 0.005, 0.01, 0.05, 0.1]:
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, svm.predict(X_val))))

# Refit on the full training matrix and score against the test matrix.
# NOTE(review): C=0.01 is hard-coded, while the saved output of this notebook
# shows C=0.1 scoring highest on validation -- confirm which value is intended.
final = LinearSVC(C=0.01)
final.fit(X, target)
# `target` is reused as the test labels.
# NOTE(review): presumes the test file mirrors the training file's size and
# label ordering -- confirm.
print("Final Accuracy: %s"
      % accuracy_score(target, final.predict(X_test)))

Accuracy for C=0.001: 0.8864
Accuracy for C=0.005: 0.892
Accuracy for C=0.01: 0.89232
Accuracy for C=0.05: 0.89264
Accuracy for C=0.1: 0.89328
Final Accuracy: 0.90064


# Тестирование классификации слов по их тональности

In [18]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old accessor on older installations.
if hasattr(ngram_vector, "get_feature_names_out"):
    feature_names = ngram_vector.get_feature_names_out()
else:
    feature_names = ngram_vector.get_feature_names()

# Map each n-gram to its learned weight in the final linear model.
# Casting to built-in str/float keeps the printed tuples in the plain
# ('word', 0.123) form regardless of the NumPy scalar repr.
feature_to_coef = {
    str(word): float(coef) for word, coef in zip(feature_names, final.coef_[0])
}

# Ten features with the largest (most positive) coefficients.
for best_positive in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1],
    reverse=True)[:10]:
    print(best_positive)

print("\n\n")
# Ten features with the smallest (most negative) coefficients.
for best_negative in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1])[:10]:
    print(best_negative)

('excellent', 0.22932149768891325)
('perfect', 0.18456041969094136)
('great', 0.1789748534389449)
('wonderful', 0.16014961633969868)
('amazing', 0.15411678321008684)
('superb', 0.1469075636918959)
('enjoyable', 0.14346762609379118)
('best', 0.13042560065203518)
('today', 0.12939426815994295)
('fun', 0.12682166020477575)



('worst', -0.35899087131912844)
('awful', -0.2550574632816644)
('boring', -0.24068186241109035)
('waste', -0.2368369745571283)
('bad', -0.2218196683543501)
('poor', -0.20193935295260387)
('terrible', -0.19984464244924263)
('dull', -0.18413720022589303)
('poorly', -0.175340690248661)
('disappointment', -0.17488533844028098)
