## Text Classification Using TF-IDF
The dataset consists of 2225 documents from the BBC news website corresponding to stories in five topical areas from 2004-2005.

* business
* entertainment
* politics
* sport
* tech

Downloaded from here: http://mlg.ucd.ie/datasets/bbc.html

In [1]:
import pandas as pd
import ast
import os
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib


In [2]:
root_path = '/Users/guilhermede-oliveira/Library/DataScienceStudio/dss_design_v5/managed_folders/BBCARTICLES/hZFXEdPS/BBC'


In [5]:
# load articles into a Pandas DataFrame

#path = '/Users/guilhermede-oliveira/Library/DataScienceStudio/dss_design_v5/managed_folders/BBCARTICLES/hZFXEdPS/BBC'

results = []
for article_class in os.listdir(root_path):
    folder_path = os.path.join(root_path, article_class)
    print(folder_path)
    if not os.path.isdir(folder_path):
        continue
    else:
        #print article_class, path
        articles = os.listdir(folder_path)
        print(len(articles))
        for article in sorted(articles):
            #print article
            article_path = os.path.join(folder_path, article)
            #print article_path
            with open(article_path, 'r') as f:
                text = f.readlines()
            #print text
            results.append([article_class, article, text])

df = pd.DataFrame(results, columns=['label', 'file_name', 'raw_text'])
print(df.shape)
df.head(3)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/guilhermede-oliveira/Library/DataScienceStudio/dss_design_v5/managed_folders/BBCARTICLES/hZFXEdPS/BBC'

In [4]:
# load each article into a variable X, perform some preprocessing, ie
# remove line-breaks ('\n')
# this is an area where a lot of additional preprocessing could be added
# for instance you could remove numbers, extra spaces, etc...
X = df['raw_text'].values
X = [' '.join(x).replace('\n', '') for x in X]
print(len(X))

2225


In [5]:
# encode the labels into the y-variable
labels = df['label'].unique().tolist()
labels = {label:k for k,label in enumerate(labels)}
print(labels)
y = df['label'].map(labels).values


{'politics': 3, 'sport': 2, 'tech': 4, 'business': 1, 'entertainment': 0}


## TF-IDF vectorization

In [6]:
# in the TF-IDF below, we are getting rid of english stop words, we are considering unigrams and bigrams, 
# we are pruning the dictionary (max_df, min_df), and we are ignoring decoding errors

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=0.8, min_df=40, decode_error='ignore')

X_vectorized = vectorizer.fit_transform(X)
print(X_vectorized.shape)


(2225, 1852)


In [6]:
# these are the first 100 tokens in the vectorized documents
print(vectorizer.get_feature_names()[:100])


NameError: name 'vectorizer' is not defined

# ## Train/Validation Split


In [8]:
# split the data into a training/test set and a validation set

X_train, X_val, y_train, y_val = train_test_split(X_vectorized, y, test_size=0.15, random_state=42)

print(X_vectorized.shape)
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)


(2225, 1852)
(1891, 1852) (1891,)
(334, 1852) (334,)


## Train a Random Forest Classifier Using Grid Search


In [9]:
rfc = RandomForestClassifier()

params = {"n_estimators":[100],
          "max_depth": [3, 6, 9, 12, 15],
          "min_samples_split": [10, 20, 40],
          "criterion": ["gini", "entropy"],
          "min_samples_leaf": [10, 20, 40]
         }

grid_search = GridSearchCV(estimator=rfc, param_grid=params, cv=3)
print(grid_search)


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [100], 'min_samples_split': [10, 20, 40], 'criterion': ['gini', 'entropy'], 'max_depth': [3, 6, 9, 12, 15], 'min_samples_leaf': [10, 20, 40]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)


In [10]:
grid_search.fit(X_vectorized, y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [100], 'min_samples_split': [10, 20, 40], 'criterion': ['gini', 'entropy'], 'max_depth': [3, 6, 9, 12, 15], 'min_samples_leaf': [10, 20, 40]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
print(grid_search.best_params_)


{'min_samples_split': 20, 'n_estimators': 100, 'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 10}


In [12]:
print(grid_search.best_score_)


0.917752808988764


## Evaluate Model Performance on the Validation Set


In [13]:
# apply the model to the validation set

y_pred = grid_search.predict(X_val)


In [14]:
print(y_pred.shape, y_val.shape)
print(sum(y_pred == y_val))


(334,) (334,)
321


In [15]:
# this means out of 334 records in the validation set, 319 were predicted correctly
319/334.

0.9550898203592815

## Write pickled model and model parameters to Folder

In [16]:
model_path = '/Users/guilhermede-oliveira/Documents/Columbia/Fall 2018 - Capstone/Text Classification'

pickle_file_path = os.path.join(model_path, 'random_forest_classifier.pkl')
param_file_path = os.path.join(model_path, 'model_parameters.json')

print(pickle_file_path)
print(param_file_path)


/Users/guilhermede-oliveira/Documents/Columbia/Fall 2018 - Capstone/Text Classification/random_forest_classifier.pkl
/Users/guilhermede-oliveira/Documents/Columbia/Fall 2018 - Capstone/Text Classification/model_parameters.json


In [17]:
predictor = grid_search.best_estimator_
model_parameters = predictor.get_params()


In [18]:
# write pickled model to folder
joblib.dump(predictor, pickle_file_path)



['/Users/guilhermede-oliveira/Documents/Columbia/Fall 2018 - Capstone/Text Classification/random_forest_classifier.pkl']

In [19]:
# write model paramters to folder
with open(param_file_path, 'w') as fp:
    json.dump(model_parameters, fp, sort_keys=True, indent=4)


## Feature Importance

In [20]:
predictor.feature_importances_


array([0.00129982, 0.        , 0.0003993 , ..., 0.        , 0.        ,
       0.        ])

In [21]:
fi = pd.DataFrame(predictor.feature_importances_,
                  index = vectorizer.get_feature_names(),
                  columns=['importance']).sort_values('importance',ascending=False)


In [22]:
fi['feature'] = fi.index
fi = fi[['feature', 'importance']]
print(fi.shape)

fi.head()


(1852, 2)


Unnamed: 0,feature,importance
mr,mr,0.036499
market,market,0.028711
government,government,0.028004
game,game,0.024744
users,users,0.022529


In [23]:
# This tells us the tokens in the text that had the most important effect on the prediction