### Document Classification

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd

import warnings; warnings.simplefilter('ignore')
import os, codecs, string, random
import numpy as np
from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle
import matplotlib.pyplot as plt
%matplotlib inline  

seed = 42
random.seed(seed)
np.random.seed(seed)

#NLP libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim_models


#Scikit imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV


#### 1: obtain a large collection of documents and labels it classes : recover sample fof quotation from quotabank and manualy label it climate or not 

We already have quotations related to climates, now we will extract some random quotation from each years and then construte manually a set of quotation related to climate or not each coming from different years. 


In [47]:
dico={} #We create a dictonary to loop over our years. 
for date in [2020, 2019, 2018, 2017, 2016, 2015]:
    dico[date] = pd.read_json(f'data/quotes-{date}.json.bz2', lines=True, compression='bz2', chunksize=100000)

In [49]:
def chunk_filtering(chunk, i):
    
    template=[] #Creation of an empty list :it's always cheaper to append to a list and create a DataFrame than append on a empty dataframe
    template.append(chunk.loc[i:i]) #selecte one quotation per chunk 
    return(pd.concat(template, ignore_index=True))# return a dataframe with our data of interest

In [50]:
for date, df in dico.items() : 
    m=1
    for i, chunk in enumerate(df) : 
        chunk_clean=chunk_filtering(chunk, m) #recover interested row of the chunk
        header = i == 0 #we kept the name of the column only for the first chunk
        mode = 'w' if i == 0 else 'a' # For appending data to an existing CSV file (so for every chunk exepct the first one), 
                                        #we can use mode = a
        m+=100000    
        chunk_clean.to_csv(path_or_buf=f"data/clean_quotes-sample-{date}.bz2",compression='bz2',header=header, mode=mode, index = False ) #Load to CSV

In [None]:
 df_reader =  pd.read_json('data/quotes-2020.json.bz2', lines=True, compression='bz2', chunksize=100000)
m=1
for i, chunk in enumerate(df_reader) :
    chunk_clean=chunk_filtering(chunk, m) #recover interested row of the chunk
    header = i == 0 #we kept the name of the column only for the first chunk
    mode = 'w' if i == 0 else 'a' # For appending data to an existing CSV file (so for every chunk exepct the first one), 
                                        #we can use mode = a
    m+=100000
    chunk_clean.to_csv(path_or_buf="data/clean_quotes-2020-Sample.bz2",compression='bz2',header=header, mode=mode, index = False ) #Load to CSV.

In [2]:
dico_random_quotes={} 
length = 0
for date in [2020, 2018, 2017, 2016, 2015]:
    dico_random_quotes[date] = pd.read_csv(f'data/clean_quotes-sample-{date}.bz2', compression='bz2')
    length += len(dico_random_quotes[date]) #The length iss used here to obtain the total number of quotes. 

In [6]:
dico_clean={} 
length = 0
for date in [2020, 2019, 2018, 2017, 2016, 2015]:
    dico_clean[date] = pd.read_csv(f'data/clean_quotes-{date}.bz2', compression='bz2')
    length += len(dico_clean[date]) #The length is used here to obtain the total number of quotes. 

In [7]:
# create a corpus based on the csv file 
print(length)

131472


Let's create a subdata containing quotations from Quotebank data and label it climate or not. To do so, we decide to groups together random quotes and pre-selecte climate related quotes. Then, we will filtering the duplicates (to be sure that random quotes does not have quotes about climate) and assigne 1 to climate if the quotes comes from the pre-selected data from Milestone 2 or 0

In [13]:
m =[]
#for date , data in dico_clean.items() : m.append(data.sample(30)) #add quotation about the climate
    
for date, data in dico_random_quotes.items() : m.append(data.sample(80, replace=True)) #add random quotation

subdata =pd.concat(m)

Filtering our subdata

In [14]:
subdata.drop_duplicates(subset=['quotation'], inplace=True) 
subdata.dropna(subset=['quotation'], inplace=True)

In [15]:
subdata.shape

(305, 9)

In [21]:
subdata[subdata['quotation'].apply(lambda x : '' in x)]

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase


Mask quotation to climate or not

In [11]:
subdata['climate']=0
subdata.iloc[:179]['climate']=1

Let's see if quotation related to climate are ok

In [16]:
subdata.to_csv('data/test.csv')

Regarding ourself the quotations it seems that they all abord climate subject.

In [12]:
subdata_train, subdata_test, label_train, label_test = train_test_split(subdata.quotation, subdata.climate, shuffle=True, test_size=0.2, random_state=42)

#### Create our bag of word 

In [30]:
vectorizer = CountVectorizer(stop_words = 'english',ngram_range=(1, 2),lowercase=False)

#initialize and specify minumum number of occurences to avoid untractable number of features
#vectorizer = CountVectorizer(min_df = 2) if we want high frequency

#create bag of words features
X = vectorizer.fit_transform(subdata.quotation)


print('Number of samples:',X.toarray().shape[0])
print('Number of features:',X.toarray().shape[1])

#mask and convert to int Frankenstein
Y = np.array(subdata.climate)



print(Y.shape)
print(X.shape)
#shuffle the data

X, Y = shuffle(X, Y, random_state=0)

#split into training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

Number of samples: 430
Number of features: 8306
(430,)
(430, 8306)


--> more features than sample, may induce high variance

In [107]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train)
X_test_tfidf=tfidf_transformer.fit_transform(X_test)
#transform the count matrix X_train to a normalized tf-idf representation
X_train_tfidf.shape 
X_test_tfidf

<86x8306 sparse matrix of type '<class 'numpy.float64'>'
	with 2260 stored elements in Compressed Sparse Row format>

In [108]:
train = [X_train, X_train_tfidf]
test = [X_test, X_test_tfidf]

#### 3  : fit to different models

Text files are actually series of words. In order to run machine learning algorithms we need to convert the text files into numerical feature vectors. We will be using bag of words model for our example. Briefly, we segment each text file into words (for English splitting by space), and count # of times each word occurs in each document and finally assign each word an integer id. Each unique word in our dictionary will correspond to a feature (descriptive feature).
Scikit-learn has a high level component which will create feature vectors for us ‘CountVectorizer’. 
Then we will train our feature with different model and seek for the better. 

To do so, we need first to find the best parameters for CountVectorize, decide if we need to transform our feature (with TF-IDF), and check for hyperparameters tunning with cross validation. 
We will use Pipeline and GridSearchCSV. 

NB : we don't look for n_grams = (1,1) because we thought that a list of simple word cannot underligne the subject an can induce biaise (for exemple : it's better for us to get 'y.. than ...)

### Let's fit some models

##### Logistic Regression 

In [137]:
penalty = ['l1', 'l2', 'elasticnet'] 
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000] 
random_state=[0]
solver=['lbfgs']

param_grid = dict(penalty=penalty, 
C=C, random_state=random_state, solver=solver) 

logistic = LogisticRegression() 

grid = GridSearchCV(estimator=logistic, param_grid=param_grid, scoring='roc_auc', verbose=1, n_jobs=-1, cv = 10) 

for features in (train) : 
    grid_result = grid.fit(features, Y_train) 
    print('Best Score: ', grid_result.best_score_) 
    print('Best Params: ', grid_result.best_params_) 

Fitting 10 folds for each of 24 candidates, totalling 240 fits
Best Score:  0.9955340136054422
Best Params:  {'C': 10, 'penalty': 'l2', 'random_state': 0, 'solver': 'lbfgs'}
Fitting 10 folds for each of 24 candidates, totalling 240 fits
Best Score:  0.9934319727891158
Best Params:  {'C': 0.0001, 'penalty': 'l2', 'random_state': 0, 'solver': 'lbfgs'}


We can see that X_train seems to give a better accuracy than tf_idf transfrom vector and that the best parameters for c equals 1. However, as we get many features compare to the number of document a simple logistic rgression may induce an high variance and so it may not be the best option

#### SVM

In [136]:
penalty = ['l1', 'l2', 'elasticnet'] 
alpha = [1e-4, 1e-3, 1e-2, 1e-1, 1e0]
loss=['log', 'hinge']
max_iter=[10,100,10000]
n_jobs = [-1]




param_grid = dict(penalty=penalty, 
alpha=alpha,loss=loss,max_iter=max_iter, n_jobs=n_jobs) 

logistic = SGDClassifier() 

grid = GridSearchCV(estimator=logistic, param_grid=param_grid, scoring='roc_auc', verbose=1, n_jobs=-1, cv = 10) 

for features in (train) : 
    grid_result = grid.fit(features, Y_train) 
    print('Best Score: ', grid_result.best_score_) 
    print('Best Params: ', grid_result.best_params_) 

Fitting 10 folds for each of 90 candidates, totalling 900 fits
Best Score:  0.9941394557823129
Best Params:  {'alpha': 0.01, 'loss': 'hinge', 'max_iter': 10, 'n_jobs': -1, 'penalty': 'l2'}
Fitting 10 folds for each of 90 candidates, totalling 900 fits
Best Score:  0.9951428571428572
Best Params:  {'alpha': 0.001, 'loss': 'hinge', 'max_iter': 10000, 'n_jobs': -1, 'penalty': 'elasticnet'}


#### Let's get our model and see what word reflect climate topic !

In [133]:
clf = LogisticRegression(random_state=0, solver='lbfgs',C = 10, penalty='l2').fit(X_train,Y_train)
predicted = clf.predict(X_test)
print('Accuracy:{}'.format(np.mean(predicted == Y_test)))

Accuracy:0.9418604651162791


In [134]:
coefs=clf.coef_[0]
top_three = np.argpartition(coefs, -20)[-20:]
print(np.array(vectorizer.get_feature_names())[top_three])

['long' 'business' 'global warming' 'greenhouse' 'fossil fuels' 'fuels'
 'fossil' 'warming' 'climate' 'renewable energy' 'business usual'
 'renewable' 'climate change' 'change' 'global' 'energy' 'emissions'
 'usual' 'CO2' 'carbon']


In [44]:
coefs=clf.coef_[0]
top_three = np.argpartition(coefs, -20)[-26:]
print(np.array(vectorizer.get_feature_names())[top_three])

['gas' 'greenhouse gases' 'gases' 'need' 'future' 'long' 'use'
 'global warming' 'climate' 'fossil' 'business usual' 'business'
 'greenhouse' 'climate change' 'fuels' 'usual' 'warming' 'energy' 'CO2'
 'global' 'change' 'renewable' 'emissions' 'carbon' 'renewable energy'
 'fossil fuels']


##### SVM

In [139]:
clf = SGDClassifier(loss="hinge", penalty="l2", alpha = 0.01,   max_iter=10, n_jobs=-1).fit(X_train, Y_train)
predicted = clf.predict(X_test)
print('Accuracy:{}'.format(np.mean(predicted == Y_test)))



coefs=clf.coef_[0]
top_three = np.argpartition(coefs, -18)[-30:]

print(np.array(vectorizer.get_feature_names())[top_three])

Accuracy:0.9651162790697675
['deliver' 'long' 'future' 'fight global' 'gas' 'lead' 'gases'
 'greenhouse gases' 'We need' 'environment' 'use' 'global warming'
 'global' 'greenhouse' 'fossil' 'business usual' 'fossil fuels' 'warming'
 'fuels' 'business' 'emissions' 'usual' 'CO2' 'renewable' 'energy'
 'climate' 'change' 'climate change' 'renewable energy' 'carbon']


In [98]:
clf = SGDClassifier(loss="hinge", penalty="elasticnet", alpha = 1e-3,   max_iter=10000, n_jobs=-1).fit(X_train_tfidf, Y_train)
predicted = clf.predict(X_test_tfidf)
print('Accuracy:{}'.format(np.mean(predicted == Y_test)))



coefs=clf.coef_[0]
top_three = np.argpartition(coefs, -18)[-30:]

print(np.array(vectorizer.get_feature_names())[top_three])

Accuracy:0.9302325581395349
['gas' 'content share' 'business' 'greenhouse gas' 'greenhouse gases'
 'global warming' 'contend' 'gases' 'content post' 'content' 'need'
 'future' 'warming' 'global' 'use' 'fossil fuels' 'carbon' 'change' 'CO2'
 'emissions' 'fuels' 'usual' 'fossil' 'energy' 'business usual' 'climate'
 'renewable' 'renewable energy' 'greenhouse' 'climate change']


In [47]:
oefs=clf.coef_[0]
top_three = np.argpartition(coefs, -20)[-25:]

print(np.array(vectorizer.get_feature_names())[top_three])

['carbon dioxide' 'national' 'gases' 'greenhouse gas' 'greenhouse gases'
 'gas' 'future' 'global' 'warming' 'emissions' 'fuels' 'CO2'
 'fossil fuels' 'fossil' 'use' 'usual' 'business usual' 'greenhouse'
 'renewable energy' 'renewable' 'carbon' 'climate change' 'climate'
 'change' 'energy']


Fitting 10 folds for each of 24 candidates, totalling 240 fits
Best Score:  0.9955340136054422
Best Params:  {'C': 10, 'penalty': 'l2'}
Fitting 10 folds for each of 24 candidates, totalling 240 fits
Best Score:  0.9934319727891158
Best Params:  {'C': 0.0001, 'penalty': 'l2'}


In [None]:
penalty = ['l1', 'l2', 'elasticnet'] 
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000] 


param_grid = dict(penalty=penalty, 
C=C) 

logistic = LogisticRegression() 

grid = GridSearchCV(estimator=logistic, param_grid=param_grid, scoring='roc_auc', verbose=1, n_jobs=-1) 

grid_result = grid.fit(X_train, Y_train) 

print('Best Score: ', grid_result.best_score_) 
print('Best Params: ', grid_result.best_params_) 

## Test Random forest

In [13]:
from sklearn.model_selection import train_test_split
# implementing train-test-split
X_train, X_test, y_train, y_test = train_test_split(subdata.quotation, subdata.climate, test_size=0.33, random_state=66)

In [16]:
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
# random forest model creation
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
# predictions
rfc_predict = rfc.predict(X_test)

ValueError: could not convert string to float: 'To my knowledge, these kind of symbiont-engulfing worms are more widespread around deep-ocean seeps than other seep-associated organisms and the nature of how they acquire nutrients through that symbiosis would extend the known habitat for seep ecosystems and our appreciation for how methane supports deep-sea ecosystems.'