import nltk
# Fetch the NLTK resources this notebook relies on.
# English stop-word list, read through `stopwords.words('english')` in pre_processing.
nltk.download('stopwords') 
# WordNet data; presumably what WordNetLemmatizer looks lemmas up in.
nltk.download('wordnet')
# The labelled movie-review corpus that is loaded into the DataFrame below.
nltk.download('movie_reviews')


In [9]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords, movie_reviews
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score

In [10]:
# Each corpus file id has the form "<label>/<file name>"; keep the label and
# pair it with the raw text of that review.
reviews = [
    (label, movie_reviews.raw(fileid))
    for fileid in movie_reviews.fileids()
    for label, _ in [fileid.split("/")]
]
df = pd.DataFrame(data=reviews, columns=['sentiment', 'review'])
print(f'This data has {df.shape} Dimensions.')
df.head()

This data has (2000, 2) Dimensions.


Unnamed: 0,sentiment,review
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


In [11]:
# Encode the labels as integers: 'pos' becomes 1, everything else ('neg') becomes 0.
# The column is overwritten in place, so only convert while it still holds the
# text labels. Without this guard a second run of the cell would compare the
# already-encoded integers against 'pos' and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = np.where(df['sentiment'] == 'pos', 1, 0)
df['sentiment'].value_counts()

0    1000
1    1000
Name: sentiment, dtype: int64

In [12]:
# Hold out 30% of the reviews for the final evaluation; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.3,
    random_state=123,
)

print(f'The train dimensions: {X_train.shape, y_train.shape}')
print(f'The test dimensions: {X_test.shape, y_test.shape}')

# Check the class distribution in the training and the test data.
for labels in (y_train, y_test):
    print(labels.value_counts())

The train dimensions: ((1400,), (1400,))
The test dimensions: ((600,), (600,))
0    700
1    700
Name: sentiment, dtype: int64
0    300
1    300
Name: sentiment, dtype: int64


### Pre-processing the data

In [13]:
def pre_processing(data):
    """Turn one raw document into a list of lemmatised, stop-word-free tokens.

    Intended to be passed as the ``analyzer`` callable of ``TfidfVectorizer``,
    which calls it once per document.

    Parameters
    ----------
    data : str
        Raw text of a single document.

    Returns
    -------
    list of str
        The lower-cased verb lemma of every word token in ``data``, in original
        order, with English stop words removed.
    """
    # Tokenise on runs of word characters, which drops punctuation.
    tokens = RegexpTokenizer(r'\w+').tokenize(data)

    # Lower-case each token, then lemmatise it as a verb (pos='v').
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token.lower(), pos='v') for token in tokens]

    # Load the stop-word list once per call and keep it in a set. Calling
    # stopwords.words() inside the filter would rebuild the list for every
    # single lemma and test membership with a linear scan.
    english_stop_words = set(stopwords.words('english'))

    # Stop words are filtered after lemmatisation, so inflected forms whose
    # lemma is a stop word are dropped as well.
    return [lemma for lemma in lemmas if lemma not in english_stop_words]

# TF-IDF vectoriser that delegates tokenising and normalising to pre_processing
# by passing it as the analyzer callable.
vectoriser = TfidfVectorizer(analyzer=pre_processing)
# Fit on the training reviews only and transform them into the matrix that the
# modelling cells below work on.
X_train_tf_idf = vectoriser.fit_transform(X_train)
# Display the shape of the resulting training matrix.
X_train_tf_idf.shape

(1400, 27676)

### Modeling 

In [14]:
# Baseline: an SGDClassifier with default settings (apart from the seed),
# scored with 5-fold cross-validation on the training matrix.
sgd_clf = SGDClassifier(random_state=123)
sgd_clf_Scores = cross_val_score(estimator=sgd_clf, X=X_train_tf_idf, y=y_train, cv=5)

print(f'SGD Scores:{sgd_clf_Scores}')
# Mean fold score, with twice the standard deviation across folds in parentheses.
print(f"Accuracy: {sgd_clf_Scores.mean():.3f} ({sgd_clf_Scores.std() * 2:.3f})")

SGD Scores:[0.82857143 0.85       0.84285714 0.81785714 0.81428571]
Accuracy: 0.831 (0.028)


In [15]:
# Same 5-fold evaluation of the baseline classifier, this time naming the
# metric explicitly; the recorded fold scores match the ones printed by the
# previous cell.
cross_val_score(sgd_clf, X_train_tf_idf, y_train, cv=5, scoring='accuracy')

array([0.82857143, 0.85      , 0.84285714, 0.81785714, 0.81428571])

In [16]:
# Out-of-fold predictions for every training review, so that the confusion
# matrix is built from predictions on data each fold's model was not fitted on.
sgd_clf_pred = cross_val_predict(estimator=sgd_clf, X=X_train_tf_idf, y=y_train, cv=5)
print("Confusion Matrix:\n", confusion_matrix(y_true=y_train, y_pred=sgd_clf_pred))

Confusion Matrix:
 [[580 120]
 [117 583]]


### Hyperparameter tuning: find the best settings for controlling the learning process

In [17]:
# Hyper-parameter grid for the SGD classifier: 2 * 2 * 3 * 3 = 36 candidate
# settings, each scored with 5-fold cross-validation.
# NOTE(review): newer scikit-learn releases appear to have renamed the 'log'
# loss to 'log_loss' and replaced the string 'none' penalty with None - confirm
# against the installed version before re-running this cell.
lr = dict(
    fit_intercept=[True, False],
    early_stopping=[True, False],
    loss=['hinge', 'log', 'squared_hinge'],
    penalty=['l2', 'l1', 'none'],
)
search = GridSearchCV(sgd_clf, lr, cv=5)
search.fit(X_train_tf_idf, y_train)
# Display the winning combination.
search.best_params_

{'early_stopping': False,
 'fit_intercept': False,
 'loss': 'log',
 'penalty': 'l1'}

In [18]:
# Cross-validate the estimator chosen by the grid search.
# NOTE(review): this uses the same training matrix and cv=5 that the search
# itself selected on, so the score is likely somewhat optimistic.
grid_sgd_clf_scores = cross_val_score(
    estimator=search.best_estimator_,
    X=X_train_tf_idf,
    y=y_train,
    cv=5,
)
print(grid_sgd_clf_scores)
# Mean fold score, with twice the standard deviation across folds in parentheses.
print(f'Accuracy: {grid_sgd_clf_scores.mean():.2f} ({grid_sgd_clf_scores.std() * 2:.2f})')

[0.85       0.85714286 0.83571429 0.84285714 0.82857143]
Accuracy: 0.84 (0.02)


### pipeline

In [19]:
# Chain the TF-IDF vectoriser and the tuned classifier so that raw review text
# can be passed straight in; fitting re-fits both steps on the training set.
p = Pipeline(steps=[
    ('Vectorizer', vectoriser),
    ('Classifier', search.best_estimator_),
])

p.fit(X_train, y_train)

Pipeline(steps=[('Vectorizer',
                 TfidfVectorizer(analyzer=<function pre_processing at 0x0000024E67898160>)),
                ('Classifier',
                 SGDClassifier(fit_intercept=False, loss='log', penalty='l1',
                               random_state=123))])

In [20]:
# Predict the held-out reviews with the fitted pipeline and report accuracy.
y_test_pred = p.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.3f}")

Accuracy: 0.853


In [21]:
# Confusion matrix on the held-out test set (rows: true label, columns: predicted).
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_test_pred))

Confusion Matrix:
 [[249  51]
 [ 37 263]]
