# Classification of News Synopsis

There are 20 classes of News synopsis for this dataset. Each class has many text documents.   
The goal of this project:
- Use the training data to learn a text document classifier and apply this classifier on all the test data to have them properly labeled 
- Compare the model label of each test doc to the test gold label and compute the evaluation. Each sub dir’s name is the gold label for all the text documents under it.

In [10]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import nltk
import re
import os
import glob
import errno

In [11]:
# Modify the name of the folders which contain '.'

path_train = 'data/train'
path_test = 'data/test'
def changename(path):
    for file in os.listdir(path):
        name = os.path.basename(file)
        new_name = []
        for item in name:
            if item == '.':
                new_name.append('_')
            else:
                new_name.append(item)
        os.rename(path + '/' + name, path + '/' + ''.join(new_name))
        
changename(path_train)
changename(path_test)

In [14]:
# Trasform data to csv format
def trandf(path):
    file = glob.glob(path)
    list = []
    for i in range(len(file)):
        new_path = file[i] + '/*'
        file1 = glob.glob(new_path)
        list1 = []
        for f in file1:
            content = open(f).read()
            list1.append(pd.DataFrame(data = {'Content': [content], 'Label' : [i]}))
        comb = pd.concat(list1)
        list.append(comb)
    raw_data = pd.concat(list)
    raw_data.to_csv('raw_data.csv')
    df = pd.read_csv('raw_data.csv')
    df = df.drop('Unnamed: 0', 1)
    return df
path_train1 = 'data/train/*'
path_test1 = 'data/test/*'
df_train = trandf(path_train1)
df_test = trandf(path_test1)
df_test.head()

Unnamed: 0,Content,Label
0,From: matt@galaxy.nsc.com (Matt Freivald x8043...,0
1,From: Clinton-HQ@Campaign92.Org (The White Hou...,0
2,From: onr@netcom.com (D. Owen Rowley)\nSubject...,0
3,From: asper@calvin.uucp (Alan E. Asper)\nSubje...,0
4,From: munoz@sweetpea.jsc.nasa.gov (tomas o mun...,0


In [15]:
#import nltk to do tokenize and steming
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
print ('We use ' + str(len(stopwords)) + ' stop-words form nltk library.')
print (stopwords[:10])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/souyixin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
We use 179 stop-words form nltk library.
[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u"you're"]


In [16]:
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')
stemmer = SnowballStemmer("english")
# Define tokenization and steming bys self
def tokenization_and_stemming(text):
    # exclude stop words and tokenize the document, generate a list of string 
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent) if word not in stopwords]

    filtered_tokens = []
    
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
            
    # stemming
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


[nltk_data] Downloading package punkt to /Users/souyixin/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [17]:
#CountVectorizer and TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_model = TfidfVectorizer(max_df=0.8,max_features = 234585,
                                 min_df=0, stop_words='english',
                                 use_idf=True, tokenizer=tokenization_and_stemming, ngram_range=(1,3))

Features_train = tfidf_model.fit_transform(df_train['Content'])
Features_test = tfidf_model.transform(df_test['Content'])
print(Features_test)

  (0, 234212)	0.0156475619930839
  (0, 234048)	0.014648041461525459
  (0, 233461)	0.042976684048947764
  (0, 233221)	0.022714941974135414
  (0, 233166)	0.009662823667840199
  (0, 233145)	0.017491545568779736
  (0, 231535)	0.04447825039282544
  (0, 230144)	0.021488342024473882
  (0, 230132)	0.018972566377083725
  (0, 228906)	0.008487761719172118
  (0, 228006)	0.020221279760818993
  (0, 227850)	0.02223912519641272
  (0, 227666)	0.02678761843114096
  (0, 227474)	0.009348811140361337
  (0, 225812)	0.04086034515176468
  (0, 225811)	0.04086034515176468
  (0, 225799)	0.025137743593085064
  (0, 225205)	0.01124645369764776
  (0, 224288)	0.020905989353605034
  (0, 224287)	0.040442559521637986
  (0, 224286)	0.04005575044503936
  (0, 223674)	0.013757947579735318
  (0, 223011)	0.015865168562402252
  (0, 222710)	0.021488342024473882
  (0, 222546)	0.033529694299523405
  :	:
  (7760, 34705)	0.07076644657002731
  (7760, 34704)	0.06928408035416041
  (7760, 33979)	0.06513081038331649
  (7760, 33937)	0.04

In [18]:
Features_test.shape

(7761, 234585)

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

#Cross validation
models = [RandomForestClassifier(n_estimators = 200, max_depth = 3, random_state = 0), MultinomialNB(), LogisticRegression(random_state = 0)]
CV = 5
cv_df = pd.DataFrame(index = range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    scores = cross_val_score(model, Features_train, df_train['Label'], scoring = 'accuracy', cv = CV)
    entries.append((model_name, scores.mean()))
cv_df = pd.DataFrame(entries, columns = ['model_name',  'accuracy'])

In [20]:
print(cv_df)

               model_name  accuracy
0  RandomForestClassifier  0.293515
1           MultinomialNB  0.873051
2      LogisticRegression  0.899670


In [21]:
# Grid Search
from sklearn.model_selection import GridSearchCV
def print_grid_search_metrics(gs):
    print ("Best score: %0.3f" % gs.best_score_)
    print ("Best parameters set:")
    best_parameters = gs.best_params_
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

parameters = {
    'penalty':('l1', 'l2'), 
    'C':(1, 5, 10)
}
Grid_LR = GridSearchCV(LogisticRegression(),parameters, cv=5, n_jobs = -1)
Grid_LR.fit(Features_train, df_train['Label'])


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'penalty': ('l1', 'l2'), 'C': (1, 5, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
print_grid_search_metrics(Grid_LR)

Best score: 0.921
Best parameters set:
	C: 10
	penalty: 'l2'


In [23]:
best_LR_model = Grid_LR.best_estimator_

In [24]:
parameters = dict( alpha = np.linspace(0,2,20)[1:] )
Grid_NB = GridSearchCV(MultinomialNB(), parameters, cv = 5, n_jobs = -1)
Grid_NB.fit(Features_train, df_train['Label'])


GridSearchCV(cv=5, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'alpha': array([0.10526, 0.21053, 0.31579, 0.42105, 0.52632, 0.63158, 0.73684,
       0.84211, 0.94737, 1.05263, 1.15789, 1.26316, 1.36842, 1.47368,
       1.57895, 1.68421, 1.78947, 1.89474, 2.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
print_grid_search_metrics(Grid_NB)

Best score: 0.904
Best parameters set:
	alpha: 0.10526315789473684


In [26]:
best_NB_model = Grid_NB.best_estimator_

In [27]:
from sklearn.metrics import f1_score
f1_score_LR = f1_score(df_test['Label'], best_LR_model.predict(Features_test), average = 'macro')
f1_score_NB = f1_score(df_test['Label'], best_NB_model.predict(Features_test), average = 'macro')


In [28]:
print(f1_score_NB)

0.7805364042230845


In [29]:
print(f1_score_LR)

0.8366473050006917


In [30]:
from sklearn import metrics
print(metrics.classification_report(df_test['Label'], best_LR_model.predict(Features_test)))

             precision    recall  f1-score   support

          0       0.86      0.65      0.74       310
          1       0.89      0.83      0.86       395
          2       1.00      0.77      0.87       827
          3       0.75      0.87      0.81       390
          4       0.98      0.91      0.94       376
          5       0.72      0.96      0.82       397
          6       0.73      0.77      0.75       392
          7       0.92      0.91      0.92       394
          8       0.78      0.92      0.84       364
          9       0.84      0.93      0.88       398
         10       0.71      0.96      0.82       198
         11       0.73      0.79      0.76       389
         12       0.78      0.65      0.71       251
         13       0.92      0.90      0.91       395
         14       0.86      0.76      0.81       319
         15       0.96      0.94      0.95       398
         16       0.83      0.87      0.85       385
         17       0.79      0.80      0.79   

In [31]:
print(metrics.classification_report(df_test['Label'], best_NB_model.predict(Features_test)))


             precision    recall  f1-score   support

          0       0.86      0.65      0.74       310
          1       0.85      0.84      0.85       395
          2       1.00      0.34      0.50       827
          3       0.77      0.82      0.80       390
          4       0.94      0.95      0.94       376
          5       0.47      0.96      0.63       397
          6       0.63      0.77      0.69       392
          7       0.85      0.90      0.88       394
          8       0.75      0.91      0.82       364
          9       0.76      0.95      0.84       398
         10       0.57      0.93      0.71       198
         11       0.73      0.73      0.73       389
         12       0.84      0.50      0.63       251
         13       0.87      0.90      0.89       395
         14       0.84      0.77      0.80       319
         15       0.93      0.92      0.93       398
         16       0.83      0.82      0.82       385
         17       0.81      0.73      0.77   