In [1]:
import pandas as pd
# Load the sentence-level sentiment table and keep only rows without any missing value
data = pd.read_csv('./Peterpan.csv').dropna(how='any')
print(f'Dimensions: {data.shape}')
data

Dimensions: (2026, 4)


Unnamed: 0.1,Unnamed: 0,sentence,posNeg,bi_posNeg
0,0,Chapter 1 PETER BREAKS THROUGH,0.000000,0.0
1,1,"All children, except one, grow up. They soon k...",0.311111,1.0
2,2,Of course they lived at 14 [their house number...,0.279545,1.0
3,3,The way Mr. Darling won her was this: the many...,0.483333,1.0
4,4,Mr. Darling used to boast to Wendy that her mo...,0.148889,1.0
...,...,...,...,...
2021,2021,or filename 24689 would be found at:,0.000000,0.0
2022,2022,http://www.gutenberg.org/2/4/6/8/24689,0.000000,0.0
2023,2023,An alternative method of locating eBooks:,0.000000,0.0
2024,2024,http://www.gutenberg.org/GUTINDEX.ALL,0.000000,0.0


In [2]:
import pandas as pd
import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score

# Fetch the NLTK resources this notebook relies on. Each call logs its progress
# and the value of the last call is what the cell displays.
# 'movie_reviews' corresponds to the corpus imported alongside `stopwords`.
nltk.download('movie_reviews')
# WordNet data is needed by WordNetLemmatizer.
nltk.download('wordnet')
# Provides the English stop-word list read via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/seawavve/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/seawavve/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/seawavve/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Hold out 30% of the sentences for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data['sentence'],
    data['bi_posNeg'],
    test_size=0.3,
    random_state=123,
)
print(f'Train dimensions: {X_train.shape, y_train.shape}')
print(f'Test dimensions: {X_test.shape, y_test.shape}')
# Class balance of the binary label in each split (train first, then test)
for _split_labels in (y_train, y_test):
    print(_split_labels.value_counts())

Train dimensions: ((1418,), (1418,))
Test dimensions: ((608,), (608,))
0.0    947
1.0    471
Name: bi_posNeg, dtype: int64
0.0    436
1.0    172
Name: bi_posNeg, dtype: int64


In [4]:
_STOP_WORDS = None  # English stop-word set, filled in lazily on first use


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Turn a raw sentence into a list of lemmatised, stop-word-free tokens.

    The text is split into runs of word characters (punctuation is dropped),
    each token is lowercased and lemmatised as a verb, and every lemma that
    appears in NLTK's English stop-word list is removed.

    Parameters
    ----------
    text : str
        The sentence to analyse.

    Returns
    -------
    list of str
        The remaining lemmas, in their original order (duplicates kept).
    """
    # Tokenise words while ignoring punctuation
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # Lowercase and lemmatise (pos='v' treats every token as a verb)
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(token.lower(), pos='v') for token in tokens]

    # Remove stop words. The stop-word list is fetched once and held as a set:
    # calling stopwords.words('english') inside the filter condition would
    # rebuild the list and scan it linearly for every single lemma.
    stop_words = _english_stop_words()
    return [lemma for lemma in lemmas if lemma not in stop_words]

# Build a TF-IDF vectoriser that uses preprocess_text as its analyzer, so each
# sentence is turned into features by that function
vectoriser = TfidfVectorizer(analyzer=preprocess_text)
# Learn the vocabulary and IDF weights from the training sentences and return
# their TF-IDF matrix (one row per sentence, one column per vocabulary term)
X_train_tfidf = vectoriser.fit_transform(X_train)
X_train_tfidf.shape

(1418, 3380)

In [5]:
# Baseline: an SGDClassifier with default hyperparameters (seeded for
# reproducibility), scored with 5-fold cross-validation on the TF-IDF training matrix
sgd_clf = SGDClassifier(random_state=123)
sgf_clf_scores = cross_val_score(sgd_clf, X_train_tfidf, y_train, cv=5)
# Per-fold scores, then their mean with a spread of two standard deviations
print(sgf_clf_scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (sgf_clf_scores.mean(), sgf_clf_scores.std() * 2))

[0.77816901 0.78169014 0.76408451 0.74558304 0.78798587]
Accuracy: 0.77 (+/- 0.03)


In [6]:
# Same 5-fold cross-validation as the baseline, with the accuracy metric requested explicitly
cross_val_score(sgd_clf, X_train_tfidf, y_train, cv=5, scoring='accuracy')

array([0.77816901, 0.78169014, 0.76408451, 0.74558304, 0.78798587])

In [7]:
# Collect a cross-validated prediction for every training sentence (5 folds),
# then tabulate them against the true labels
sgf_clf_pred = cross_val_predict(sgd_clf, X_train_tfidf, y_train, cv=5)
# confusion_matrix(y_true, y_pred): rows are true labels, columns are predicted labels
print(confusion_matrix(y_train, sgf_clf_pred))

[[825 122]
 [202 269]]


In [8]:
# Hyperparameter grid for SGDClassifier: 2 x 2 x 3 x 3 = 36 combinations
# NOTE(review): newer scikit-learn releases appear to spell these options
# 'log_loss' (instead of 'log') and None (instead of 'none') -- confirm against
# the installed version before re-running.
grid = {'fit_intercept': [True,False],
        'early_stopping': [True, False],
        'loss' : ['hinge', 'log', 'squared_hinge'],
        'penalty' : ['l2', 'l1', 'none']}
# Exhaustive search over the grid, each combination scored with 5-fold cross-validation
search = GridSearchCV(estimator=sgd_clf, param_grid=grid, cv=5)
# NOTE(review): the search runs on the already-vectorised matrix, so the TF-IDF
# vocabulary/IDF weights were learned from all training rows, including each
# fold's validation part -- consider searching over a Pipeline instead.
search.fit(X_train_tfidf, y_train)
search.best_params_

{'early_stopping': False,
 'fit_intercept': True,
 'loss': 'log',
 'penalty': 'l1'}

In [9]:
# Re-score the best configuration found by the grid search with 5-fold
# cross-validation on the same TF-IDF training matrix
# NOTE(review): the hyperparameters were chosen on this same training data, so
# these scores are likely optimistic compared with the held-out test set.
grid_sgd_clf_scores = cross_val_score(search.best_estimator_, X_train_tfidf, y_train, cv=5)
# Per-fold scores, then their mean with a spread of two standard deviations
print(grid_sgd_clf_scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (grid_sgd_clf_scores.mean(), grid_sgd_clf_scores.std() * 2))

[0.79225352 0.79577465 0.78873239 0.75265018 0.795053  ]
Accuracy: 0.78 (+/- 0.03)


In [10]:
# Chain the TF-IDF vectoriser and the tuned classifier so raw sentences can be
# passed straight in
pipe = Pipeline([('vectoriser', vectoriser),
                 ('classifier', search.best_estimator_)])
# Fitting the pipeline refits both steps on the raw training sentences
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectoriser',
                 TfidfVectorizer(analyzer=<function preprocess_text at 0x12f64db80>)),
                ('classifier',
                 SGDClassifier(loss='log', penalty='l1', random_state=123))])

In [11]:
# Final check on the held-out 30%: predict from the raw test sentences
y_test_pred = pipe.predict(X_test)
print("Accuracy: %0.2f" % (accuracy_score(y_test, y_test_pred)))
# confusion_matrix(y_true, y_pred): rows are true labels, columns are predicted labels
print(confusion_matrix(y_test, y_test_pred))

Accuracy: 0.82
[[389  47]
 [ 65 107]]
