In [1]:
"""
Evaluate information gain/ mutual information of additional features.
"""

'\nEvaluate information gain/ mutual information of additional features.\n'

In [2]:
import json
import os
import random

import dns
import numpy as np
import pandas as pd
import pymongo
import seaborn as sns

In [3]:
# The connection string (which embeds the database user and password) is read
# from the environment so that no credentials are stored in the notebook.
# Set MONGODB_URI before starting the kernel, e.g.
#   export MONGODB_URI='mongodb+srv://<user>:<password>@<cluster-host>/test?retryWrites=true&w=majority'
# A missing variable raises KeyError here rather than failing later on first use.
client = pymongo.MongoClient(os.environ['MONGODB_URI'])

In [4]:
# Select the 'beta_db' database and, inside it, the 'comments' collection.
db = client['beta_db']
comments = db['comments']

In [5]:
# Pull every comment document that either has a 'label' field or has 'queried' == 1.
# $exists is documented to take a boolean; the string 'true' used previously only
# matched because it is coerced to a truthy value, so pass a real True.
labelled_comments = list(
    comments.find({'$or': [{'label': {'$exists': True}}, {'queried': 1}]})
)

In [6]:
# One row per fetched comment document; document fields become columns.
df = pd.DataFrame(data=labelled_comments)

In [7]:
# Lookup table turning British route-grade strings into numbers.
# NOTE(review): 'E9' and 'E10' both map to 14, and 'HVS' and 'ED1' both map to 5 --
# confirm these ties are intended.
british_grades_dict = {
    # every grade from 'M' through 'MS' shares the lowest value
    **dict.fromkeys(('M', 'D', 'HD', 'VD', 'HVD', 'MS'), 0),
    'S': 1,
    'HS': 2,
    'MVS': 3,
    'VS': 4,
    'HVS': 5,
    'ED1': 5,
    'E1': 6,
    'E2': 7,
    'E3': 8,
    'E4': 9,
    'E5': 10,
    'E6': 11,
    'E7': 12,
    'E8': 13,
    'E9': 14,
    'E10': 14,
    'E11': 15,
    'XS': 16,
    'HXS': 17,
    # the literal string 'none' maps to a missing value
    'none': np.nan,
}

# Names of all columns of df that contain the substring 'annotator'.
annotators = list(filter(lambda name: 'annotator' in name, df.columns))

In [8]:
def _grade_to_number(grade):
    """Look a grade string up in british_grades_dict; return any non-string value unchanged."""
    if type(grade) is str:
        return british_grades_dict[grade]
    return grade


# Numeric versions of the climber's max grade and of the route grade.
df['max_grade_of_climber'] = df['max_grade of climber'].map(_grade_to_number)
df['route_grade'] = df['route_grade'].map(_grade_to_number)

# 'challenge' = route grade minus the climber's max grade;
# rows where it cannot be computed receive the mean of the computed values.
raw_challenge = df['route_grade'] - df['max_grade_of_climber']
df['challenge'] = raw_challenge
df['challenge'] = raw_challenge.fillna(raw_challenge.mean())

# 1 where 'local_to' equals 'location', otherwise 0.
df['is_local'] = (df['local_to'] == df['location']).map(int)

# len() of each comment.
df['comment_len'] = df['comment'].map(len)

In [9]:
# Row-wise mean over the annotator columns, rounded to the nearest integer.
# 0.01 is added before rounding, so a mean of exactly 0.5 ends up as 1.
mean_annotation = df[annotators].mean(axis=1)
df['annotation'] = (mean_annotation + 0.01).round()

# Class balance of the annotator-derived votes and of the stored labels.
for column in ('annotation', 'label'):
    print(df[column].value_counts())

0.0    62
1.0    38
Name: annotation, dtype: int64
0.0    179
1.0    125
Name: label, dtype: int64


In [10]:
# Combine the two label sources: keep an existing 'label' and use the annotators'
# vote only where 'label' is missing. Rows missing both stay NaN.
# The columns were previously *added*, which produces 2.0 when both are 1 and
# adds the annotation again each time the cell is re-run; fillna is idempotent
# and gives the same result whenever the two sources do not overlap.
df['label'] = df['label'].fillna(df['annotation'])
print(df['label'].value_counts())

0.0    241
1.0    163
Name: label, dtype: int64


In [11]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse

In [12]:
# Non-text features whose mutual information with the label is measured.
features = ['challenge', 'is_local', 'comment_len']

# Keep the comment, the features and the label, dropping any row with a missing value.
selected_columns = ['comment'] + features + ['label']
df_no_nan = df[selected_columns].dropna()

In [22]:
# information gain for additional features
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same ndarray
# and is available on every pandas version.
# 'is_local' is the only feature declared discrete; its position is looked up by
# name so the index stays correct if `features` is ever reordered.
info = mutual_info_classif(df_no_nan[features].values,
                           df_no_nan['label'],
                           discrete_features=[features.index('is_local')],
                           random_state=1)

# Build the result once: print it as a LaTeX table, then display it as the cell output.
info_by_feature = pd.Series(info, index=features)
print(info_by_feature.to_latex(float_format="%.4f"))
info_by_feature

\begin{tabular}{lr}
\toprule
{} &      0 \\
\midrule
challenge   & 0.0175 \\
is\_local    & 0.0015 \\
comment\_len & 0.1825 \\
\bottomrule
\end{tabular}



  


challenge      0.017523
is_local       0.001505
comment_len    0.182499
dtype: float64

In [14]:
# Compare the average 'challenge' between the two classes (label 1 = positive, label 0 = negative).
positive_rows = df['label'] == 1
negative_rows = df['label'] == 0
print("mean 'challenge' for positive comments: {}".format(df.loc[positive_rows, 'challenge'].mean()))
print("mean 'challenge' for negative comments: {}".format(df.loc[negative_rows, 'challenge'].mean()))

mean 'challenge' for positive comments: -2.5503171467193515
mean 'challenge' for negative comments: -2.9518250228567413


In [18]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk 
# Download the NLTK data used below: 'punkt' is needed by nltk.word_tokenize and
# 'wordnet' by WordNetLemmatizer (without it lemmatize() raises LookupError on a
# machine that has never fetched the corpus).
for resource in ('punkt', 'wordnet'):
    nltk.download(resource)

class LemmaTokenizer:
    """Callable that turns a document into a list of lemmatized alphanumeric tokens.

    Instances are meant to be passed as the ``tokenizer`` argument of a vectorizer.
    """

    def __init__(self):
        # one lemmatizer instance, reused for every document
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Tokenize ``doc``, discard tokens that are not purely alphanumeric, and lemmatize the rest."""
        lemmas = []
        for token in nltk.word_tokenize(doc):
            # skips punctuation and tokens containing non-alphanumeric characters
            if not token.isalnum():
                continue
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/eoghancunningham/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
# information gain for textual features
# Drop rows lacking a comment or a label first: CountVectorizer rejects NaN
# documents and mutual_info_classif rejects NaN targets.
text_rows = df[['comment', 'label']].dropna()
X, Y = text_rows['comment'], text_rows['label']

# Bag of 1- to 3-grams over lemmatized tokens, keeping terms found in at least 5 comments.
cv = CountVectorizer(min_df=5, stop_words='english', ngram_range=(1, 3), tokenizer=LemmaTokenizer())
X_vec = cv.fit_transform(X)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it and fall back to the old name otherwise.
get_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
res = pd.Series(mutual_info_classif(X_vec, Y, discrete_features=True), index=get_names())

# Sort once and reuse the slice for both the LaTeX table and the cell output.
top_five = res.sort_values(ascending=False).iloc[:5]
print(top_five.to_latex(float_format="%.4f"))
print("Top five informative text features : ")
top_five

\begin{tabular}{lr}
\toprule
{} &      0 \\
\midrule
right & 0.0790 \\
reach & 0.0686 \\
crack & 0.0672 \\
foot  & 0.0604 \\
left  & 0.0584 \\
\bottomrule
\end{tabular}

Top five informative text features : 


right    0.078956
reach    0.068579
crack    0.067228
foot     0.060379
left     0.058406
dtype: float64