In [118]:
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import numpy as np


from sklearn.linear_model import LogisticRegression
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier


# import sys
# if not sys.warnoptions:
#     import warnings
#     warnings.simplefilter("ignore")    

Load Data

In [58]:

df = pd.read_fwf('trainingdata.txt', delim_whitespace = True, header = None)

df = df.rename(columns = {df.columns[0]:'text'})
df = df[['text']][1:]

df = pd.DataFrame(df['text'].str.split(" ", 1).tolist(), columns = ['category','text'])

df.head()

Unnamed: 0,category,text
0,1,champion products ch approves stock split cham...
1,2,computer terminal systems cpml completes sale ...
2,1,cobanco inc cbco year net shr cts vs dlrs net ...
3,1,am international inc am nd qtr jan oper shr lo...
4,1,brown forman inc bfd th qtr net shr one dlr vs...


Text Cleaning

In [63]:
# change to lower case
df['text'] = df['text'].str.lower()

# remove punctuation
punctuation_signs = list("?:!.,;")

for punct_sign in punctuation_signs:
    df['text'] = df['text'].str.replace(punct_sign, '')
    
# remove possessive pronoun terminations
df['text'] = df['text'].str.replace("'s", "")



df.head()

Unnamed: 0,category,text
0,1,champion products ch approves stock split cham...
1,2,computer terminal systems cpml completes sale ...
2,1,cobanco inc cbco year net shr cts vs dlrs net ...
3,1,am international inc am nd qtr jan oper shr lo...
4,1,brown forman inc bfd th qtr net shr one dlr vs...


Stemming and Lemmatization

In [65]:
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /Users/lin/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [66]:
wordnet_lemmatizer = WordNetLemmatizer()

nrows = len(df)
lemmatized_text_list = []

for row in range(0, nrows):
    
    # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    # Save the text and its words into an object
    text = df.loc[row]['text']
    text_words = text.split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    lemmatized_text = " ".join(lemmatized_list)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)
    
df['text'] = lemmatized_text_list

In [67]:
df.head()

Unnamed: 0,category,text
0,1,champion products ch approve stock split champ...
1,2,computer terminal systems cpml complete sale c...
2,1,cobanco inc cbco year net shr cts vs dlrs net ...
3,1,be international inc be nd qtr jan oper shr lo...
4,1,brown forman inc bfd th qtr net shr one dlr vs...


stop words

In [68]:
# Downloading the stop words list
nltk.download('stopwords')

stop_words = list(stopwords.words('english'))

for stop_word in stop_words:
    regex_stopword = r"\b" + stop_word + r"\b"
    df['text'] = df['text'].str.replace(regex_stopword, '')

[nltk_data] Downloading package stopwords to /Users/lin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [69]:
df.head()

Unnamed: 0,category,text
0,1,champion products ch approve stock split champ...
1,2,computer terminal systems cpml complete sale c...
2,1,cobanco inc cbco year net shr cts vs dlrs net ...
3,1,international inc nd qtr jan oper shr loss t...
4,1,brown forman inc bfd th qtr net shr one dlr vs...


Feature Engineering

In [70]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], 
                                                    df['category'], 
                                                    test_size=0.15, 
                                                    random_state=8)

In [171]:
# Parameter election
ngram_range = (1,2)
# min_df = 10
# max_df = 1.
max_features = 300

In [172]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
#                         max_df=max_df,
#                         min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

(4662, 300)
(823, 300)


Multinomial Logistic Regression

In [76]:
lr = LogisticRegression(random_state = 1)

print('Parameters currently in use:\n')
pprint(lr.get_params())

Parameters currently in use:

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 1,
 'solver': 'warn',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}


In [88]:

# C
C = [float(x) for x in np.linspace(start = 0.1, stop = 1, num = 10)]

# multi_class
multi_class = ['multinomial']

# solver
solver = ['newton-cg', 'sag', 'saga', 'lbfgs']
 
# class_weight
class_weight = ['balanced', None]

# penalty
penalty = ['l2']

# Create the random grid
random_grid = {'C': C,
               'multi_class': multi_class,
               'solver': solver,
               'class_weight': class_weight,
               'penalty': penalty}

pprint(random_grid)

{'C': [0.1,
       0.2,
       0.30000000000000004,
       0.4,
       0.5,
       0.6,
       0.7000000000000001,
       0.8,
       0.9,
       1.0],
 'class_weight': ['balanced', None],
 'multi_class': ['multinomial'],
 'penalty': ['l2'],
 'solver': ['newton-cg', 'sag', 'saga', 'lbfgs']}


In [95]:

# First create the base model to tune
lr = LogisticRegression(random_state=1)

# Definition of the random search
random_search = RandomizedSearchCV(estimator=lr,
                                   param_distributions=random_grid,
                                   n_iter=50,
                                   scoring='accuracy',
                                   cv=3, 
                                   verbose=1, 
                                   random_state=1)

# Fit the random search model
random_search.fit(features_train, labels_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  2.8min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=50, n_jobs=None,
          param_distributions={'C': [0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6, 0.7000000000000001, 0.8, 0.9, 1.0], 'multi_class': ['multinomial'], 'solver': ['newton-cg', 'sag', 'saga', 'lbfgs'], 'class_weight': ['balanced', None], 'penalty': ['l2']},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=1)

In [96]:
print("The best hyperparameters from Random Search are:")
print(random_search.best_params_)
print("")
print("The mean accuracy of a model with these hyperparameters is:")
print(random_search.best_score_)

The best hyperparameters from Random Search are:
{'solver': 'sag', 'penalty': 'l2', 'multi_class': 'multinomial', 'class_weight': None, 'C': 1.0}

The mean accuracy of a model with these hyperparameters is:
0.9455169455169455


After that, we can do a more exhaustive search centered in those values:

Grid Search Cross Validation

In [108]:
# Create the parameter grid based on the results of random search 
C = [float(x) for x in np.linspace(start = 0.6, stop = 1, num = 10)]
multi_class = ['multinomial']
solver = ['sag']
class_weight = ['balanced']
penalty = ['l2']

param_grid = {'C': C,
               'multi_class': multi_class,
               'solver': solver,
               'class_weight': class_weight,
               'penalty': penalty}

# Create a base model
lrc = LogisticRegression(random_state=8)

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 2, test_size = .33, random_state = 1)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=lrc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(features_train, labels_train)


Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   57.0s finished


GridSearchCV(cv=ShuffleSplit(n_splits=2, random_state=1, test_size=0.33, train_size=None),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=8, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.6, 0.6444444444444444, 0.6888888888888889, 0.7333333333333333, 0.7777777777777778, 0.8222222222222222, 0.8666666666666667, 0.9111111111111111, 0.9555555555555555, 1.0], 'multi_class': ['multinomial'], 'solver': ['sag'], 'class_weight': ['balanced'], 'penalty': ['l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [109]:
best_lrc = grid_search.best_estimator_

best_lrc

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=None, penalty='l2',
          random_state=8, solver='sag', tol=0.0001, verbose=0,
          warm_start=False)

In [110]:
best_lrc.fit(features_train, labels_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=None, penalty='l2',
          random_state=8, solver='sag', tol=0.0001, verbose=0,
          warm_start=False)

In [111]:
lrc_pred = best_lrc.predict(features_test)

In [112]:
# Training accuracy
print("The training accuracy is: ")
print(accuracy_score(labels_train, best_lrc.predict(features_train)))

The training accuracy is: 
0.9081939081939082


In [113]:
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test, lrc_pred))

The test accuracy is: 
0.9125151883353585


In [114]:
print("Classification report")
print(classification_report(labels_test,lrc_pred))

Classification report
              precision    recall  f1-score   support

           1       0.98      0.93      0.95       414
           2       0.85      0.98      0.91       243
           3       0.96      0.96      0.96        54
           4       0.36      0.45      0.40        11
           5       0.00      0.00      0.00         4
           6       0.86      0.93      0.89        41
           7       0.87      0.62      0.72        21
           8       0.84      0.60      0.70        35

   micro avg       0.91      0.91      0.91       823
   macro avg       0.72      0.68      0.69       823
weighted avg       0.91      0.91      0.91       823



Gradient Boosting Machine

Cross-Validation for Hyperparameter tuning

In [119]:
gb = GradientBoostingClassifier(random_state = 1)

print('Parameters currently in use:\n')
pprint(gb.get_params())

Parameters currently in use:

{'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'presort': 'auto',
 'random_state': 1,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}


In [121]:
# Create the parameter grid based on the results of random search 
max_depth = [5, 10, 15]
max_features = ['sqrt']
min_samples_leaf = [2]
min_samples_split = [50, 100]
n_estimators = [800]
learning_rate = [.1, .5]
subsample = [1.]

param_grid = {
    'max_depth': max_depth,
    'max_features': max_features,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'n_estimators': n_estimators,
    'learning_rate': learning_rate,
    'subsample': subsample

}

# Create a base model
gbc = GradientBoostingClassifier(random_state=8)

# Manually create the splits in CV in order to be able to fix a random_state (GridSearchCV doesn't have that argument)
cv_sets = ShuffleSplit(n_splits = 2, test_size = .33, random_state = 8)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=gbc, 
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=cv_sets,
                           verbose=1)

# Fit the grid search to the data
grid_search.fit(features_train, labels_train)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  9.5min finished


GridSearchCV(cv=ShuffleSplit(n_splits=2, random_state=8, test_size=0.33, train_size=None),
       error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': [5, 10, 15], 'max_features': ['sqrt'], 'min_samples_leaf': [2], 'min_samples_split': [50, 100], 'n_estimators': [800], 'learning_rate': [0.1, 0.5], 'subsample': [1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [123]:
best_gbc = grid_search.best_estimator_

best_gbc

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=2, min_samples_split=50,
              min_weight_fraction_leaf=0.0, n_estimators=800,
              n_iter_no_change=None, presort='auto', random_state=8,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [124]:
best_gbc.fit(features_train, labels_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=10,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=2, min_samples_split=50,
              min_weight_fraction_leaf=0.0, n_estimators=800,
              n_iter_no_change=None, presort='auto', random_state=8,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [125]:
gbc_pred = best_gbc.predict(features_test)

In [126]:
print("The training accuracy is: ")
print(accuracy_score(labels_train, best_gbc.predict(features_train)))

The training accuracy is: 
0.9987129987129987


In [127]:
# Test accuracy
print("The test accuracy is: ")
print(accuracy_score(labels_test, gbc_pred))

The test accuracy is: 
0.9550425273390036


In [130]:


df2 = pd.read_fwf('stdin.txt', delim_whitespace = True, header = None)

df2 = df2.rename(columns = {df2.columns[0]:'text'})
df2 = df2[['text']][1:]


df2.head()

Unnamed: 0,text
1,This is a document
2,this is another document
3,documents are seperated by


In [142]:
# change to lower case
df2['text'] = df2['text'].str.lower()

# remove punctuation
punctuation_signs = list("?:!.,;")

for punct_sign in punctuation_signs:
    df2['text'] = df2['text'].str.replace(punct_sign, '')
    
# remove possessive pronoun terminations
df2['text'] = df2['text'].str.replace("'s", "")


In [141]:
stop_words = list(stopwords.words('english'))

for stop_word in stop_words:
    regex_stopword = r"\b" + stop_word + r"\b"
    df2['text'] = df2['text'].str.replace(regex_stopword, '')

In [143]:
wordnet_lemmatizer = WordNetLemmatizer()

nrows = len(df2)
lemmatized_text_list = []

for row in range(0, nrows):
    
    # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    
    # Save the text and its words into an object
    text = df.loc[row]['text']
    text_words = text.split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    lemmatized_text = " ".join(lemmatized_list)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)
    
df2['text'] = lemmatized_text_list

In [175]:
X_new = df2['text']

tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
#                         max_df=max_df,
#                         min_df=min_df,
                        max_features=300,
                        norm='l2',
                        sublinear_tf=True)
                        
features_new = tfidf.fit_transform(X_new).toarray()

# print(features_new)
# best_gbc.predict(features_new)

In [176]:
best_gbc.predict(features_new)

array(['2', '1', '1'], dtype=object)