In [None]:
# Install NLTK into the environment of the running kernel.
# `pip.main` is no longer part of pip's public API (removed in pip 10), so pip
# is invoked as a subprocess of the current interpreter instead.
import subprocess
import sys

subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--user', 'nltk'])

import warnings
# Import scikit-learn with FutureWarning messages silenced. The filter is
# scoped to this `with` block; the previous warning filters are restored on exit.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=FutureWarning)
    import sklearn
    from sklearn.feature_extraction import DictVectorizer
    import sklearn.feature_extraction.text
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
    from sklearn import metrics


In [None]:
import numpy as np
import pandas as pd
import topbox
import sklearn
from sklearn.feature_extraction import DictVectorizer
import sklearn.feature_extraction.text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn import metrics

%matplotlib inline
%run plot_learning_curve

# Load Dataset

In [None]:
import pandas as pd
 
import nltk
# NLTK resources needed by the feature extractors defined in this notebook.
nltk.download('stopwords')                   # Spanish stop-word list (custom_tokenizer)
nltk.download('averaged_perceptron_tagger')  # POS tagger model (PosStats)
nltk.download('universal_tagset')            # mapping to universal POS tags (PosStats)
# Punkt sentence models are required by sent_tokenize (LexicalStats); without
# them the first call raises LookupError on a fresh environment.
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')

# NOTE(review): an earlier comment here said the files are ISO-8859-1 encoded,
# yet the file is read as UTF-8 - confirm the real encoding of the CSV.
df = pd.read_csv("tweetsCSV/Esp/Theme/themeTrain.csv", encoding='utf-8', delimiter=",", header=0)

df


In [None]:
# Encode the theme labels "a".."e" as the integers 0..4, one label at a time.
for theme_code, theme_label in enumerate(("a", "b", "c", "d", "e")):
    df.loc[df["Themes"] == theme_label, "Themes"] = theme_code

# The column still holds generic objects after the replacement; make it int64.
df['Themes'] = df['Themes'].astype(np.int64)

df.dtypes
df.head()

In [None]:
# Define X and Y
# X: array of the raw values of the 'text' column (the classifier input).
# y: array of the 'Themes' column, integer-encoded in the previous cell (the target).
X = df['text'].values
y = df['Themes'].values

# Lexical features

In [None]:
# Sample of statistics using nltk
# Another option is defining a function and pass it as a parameter to FunctionTransformer

from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import sent_tokenize, word_tokenize

class LexicalStats (BaseEstimator, TransformerMixin):
    """Stateless transformer turning each document into a dict of lexical statistics."""

    def number_sentences(self, doc):
        """Return how many sentences NLTK's Spanish sentence tokenizer finds in `doc`."""
        return len(sent_tokenize(doc, language='spanish'))

    def fit(self, x, y=None):
        """Nothing is learned from the data; return `self` unchanged."""
        return self

    def transform(self, docs):
        """Return a list with one ``{'length', 'num_sentences'}`` dict per document.

        'length' is ``len(doc)`` and 'num_sentences' comes from `number_sentences`.
        """
        stats_per_doc = []
        for doc in docs:
            stats_per_doc.append({
                'length': len(doc),
                'num_sentences': self.number_sentences(doc),
            })
        return stats_per_doc

    

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import re
import string

def custom_tokenizer(words):
    """Tokenize a tweet and return its cleaned, stemmed tokens.

    The text is lower-cased, split with NLTK's ``TweetTokenizer`` (handles are
    kept at this stage, repeated characters are shortened) and every token is
    stemmed with ``PorterStemmer``. A stemmed token is then discarded when it is

    * a Spanish stop word,
    * a punctuation character other than '?',
    * the word "insomnio" (presumably the query term of the corpus - verify),
    * a user handle (starts with '@') or one of the fixed noise tokens,
    * something containing '/' or 'http' followed by a character (link pieces),
    * made of digits only.

    NOTE(review): ``PorterStemmer`` implements the English Porter algorithm
    while the stop-word list is Spanish, and stop words are matched after
    stemming - confirm this is intended.

    Args:
        words: the raw text of one document.

    Returns:
        list of str: the surviving stemmed tokens, in their original order.
    """
    # Fixed tokens that carry no information about the theme.
    noise_tokens = {"rt", "@", "http", "https", "'s", '...', 'english',
                    'translation', '):', '. .', '..'}
    link_fragment = re.compile(r'http.')
    slash = re.compile(r'//*')

    tknzr = TweetTokenizer(strip_handles=False, reduce_len=True)
    porter = PorterStemmer()
    # Sets give constant-time membership tests; the stop-word list is read once per call.
    stoplist = set(stopwords.words('spanish'))
    # '?' is deliberately kept as a token, every other punctuation mark is removed.
    dropped_punctuation = set(string.punctuation) - {'?'}

    lemmas_clean = []
    for token in tknzr.tokenize(words.lower()):
        stem = porter.stem(token)
        if stem in stoplist or stem in dropped_punctuation:
            continue
        if stem == "insomnio" or stem.startswith('@') or stem in noise_tokens:
            continue
        if slash.search(stem) or link_fragment.search(stem) or stem.isdigit():
            continue
        lemmas_clean.append(stem)

    return lemmas_clean

# Syntactic features

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk import pos_tag
from collections import Counter 

class PosStats(BaseEstimator, TransformerMixin):
    """Compute, per document, the share of tokens in each universal POS category."""

    def stats(self, doc):
        """Return the relative frequency of selected POS tags in `doc`.

        Args:
            doc: the raw text of one document.

        Returns:
            dict: one entry for each of NOUN, ADJ, VERB, ADV, CONJ, ADP, PRON
            and NUM, holding the fraction of tagged tokens with that tag
            (0 when the tag does not occur or the document has no tokens).
        """
        tokens = custom_tokenizer(doc)
        # Tag the token list, not the raw string: pos_tag iterates over its
        # argument, so passing `doc` would tag every single character.
        tagged = pos_tag(tokens, tagset='universal')
        counts = Counter(tag for word, tag in tagged)
        total = sum(counts.values())
        # Fixed key set so that every document yields the same features.
        pos_features = {'NOUN': 0, 'ADJ': 0, 'VERB': 0, 'ADV': 0, 'CONJ': 0,
                        'ADP': 0, 'PRON': 0, 'NUM': 0}

        # `counts` is empty whenever `total` is 0, so the division is safe.
        for tag, count in counts.items():
            if tag in pos_features:
                pos_features[tag] = float(count) / total
        return pos_features

    def transform(self, docs, y=None):
        """Return the list of `stats` dicts, one per document in `docs`."""
        return [self.stats(tweet) for tweet in docs]

    def fit(self, docs, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self

# Feature Extraction Pipelines

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion


# Word n-gram features: counts of 1- to 3-grams built from the tokens returned
# by `custom_tokenizer`, then re-weighted with TF-IDF.
ngrams_featurizer = Pipeline([
  ('count_vectorizer',  CountVectorizer(ngram_range = (1, 3), encoding = 'utf-8', 
                                        tokenizer=custom_tokenizer)),
  ('tfidf_transformer', TfidfTransformer())
])

In [None]:
class TopicTopWords(BaseEstimator, TransformerMixin):
    """Score every tweet against the top-word table of each theme.

    For each theme a CSV file maps words to a weight. A tweet's score for a
    theme is the sum of the weights of its tokens found in that theme's table.
    The tables are read from disk once per instance and then reused, instead
    of being reloaded for every single tweet.
    """

    # Theme identifiers; each one has a `topWords_<theme>.csv` table.
    THEMES = ("a", "b", "c", "d", "e")
    WORDS_FILE_PATTERN = "tweetsCSV/Esp/Theme/topWords_%s.csv"

    def fit(self, X, y=None):
        """Nothing is learned from the data; return `self` unchanged."""
        return self

    def loadWords(self, file):
        """Read a top-words CSV and return it as a ``{word: weight}`` dict.

        The file must have a header row with a "words" column; the weight is
        taken from the first remaining column.
        """
        table = pd.read_csv(file, encoding='utf-8', delimiter=",", header=0)
        table = table.set_index("words")
        return dict((word, row.values[0]) for word, row in table.iterrows())

    def _theme_tables(self):
        """Return ``{theme: {word: weight}}``, loading the CSV files on first use."""
        tables = getattr(self, "_tables", None)
        if tables is None:
            tables = dict((theme, self.loadWords(self.WORDS_FILE_PATTERN % theme))
                          for theme in self.THEMES)
            self._tables = tables
        return tables

    def topWords(self, tweet, resultlist):
        """Append the per-theme scores of one tokenized tweet to `resultlist`.

        Args:
            tweet: iterable of tokens of one tweet.
            resultlist: list that receives the ``{theme: score}`` dict.

        Returns:
            list: the same `resultlist`, after the append.
        """
        tables = self._theme_tables()
        theme_dict = {}
        for theme in self.THEMES:
            table = tables[theme]
            theme_dict[theme] = sum(table[term] for term in tweet if term in table)
        resultlist.append(theme_dict)

        return resultlist

    def transform(self, data):
        """Return one ``{theme: score}`` dict per raw tweet in `data`."""
        listaresultado = []
        for tweet in data:
            self.topWords(custom_tokenizer(tweet), listaresultado)
        return listaresultado

# Feature Union Pipeline

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, make_scorer

def pipeline(clf):
    """Build the complete model: feature extraction followed by `clf`.

    The 'features' step concatenates four feature groups: lexical statistics,
    TF-IDF over single tokens, TF-IDF over 1- to 3-grams and the per-theme
    top-word scores. The 'clf' step is the classifier passed in.

    Args:
        clf: the scikit-learn estimator used as final step.

    Returns:
        Pipeline: the unfitted pipeline with steps 'features' and 'clf'.
    """
    lexical_stats = Pipeline([
        ('stats', LexicalStats()),
        ('vectors', DictVectorizer()),
    ])
    # Topics of the docs
    theme_word_scores = Pipeline([
        ('topWords', TopicTopWords()),
        ('vect', DictVectorizer()),
    ])
    features = FeatureUnion([
        ('lexical_stats', lexical_stats),
        ('words', TfidfVectorizer(tokenizer=custom_tokenizer)),
        ('ngrams', ngrams_featurizer),
        ('label-lda', theme_word_scores),
    ])
    # Machine learning: the classifier consumes the concatenated features.
    return Pipeline([
        ('features', features),
        ('clf', clf),
    ])

def classification_report_with_accuracy_score(y_true, y_pred):
    """Print the classification report for the predictions and return their accuracy.

    Intended to be wrapped with `make_scorer`, so each scored fold also prints
    its own report.
    """
    report = classification_report(y_true, y_pred)
    print (report)
    accuracy = accuracy_score(y_true, y_pred)
    return accuracy

# Five unshuffled folds over all samples, shared by the evaluation cells.
# NOTE(review): this is the legacy `sklearn.cross_validation.KFold` signature
# (sample count first, `n_folds=`); `random_state` presumably has no effect
# while shuffle=False - confirm.
cv = KFold(X.shape[0], n_folds=5, shuffle=False, random_state=33)


## Train, Optimize and Evaluate models


### MultinomialNB

In [None]:
from sklearn.model_selection import GridSearchCV
#Optimize multinomialNB
# Grid-search the smoothing parameter `alpha` of the classifier step ('clf').
pipelineNB = pipeline(MultinomialNB(alpha=.001))

parametersNB = {'clf__alpha': [.0001,.001,.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]}

# No `cv` argument is given, so GridSearchCV uses its own default splitting,
# not the `cv` folds defined earlier.
gs_NB = GridSearchCV(pipelineNB, parametersNB, n_jobs=-1)

In [None]:
# Run the grid search on the whole data set.
gs_NB = gs_NB.fit(X, y)

In [None]:
# Report the best cross-validated score and the parameter values that produced it.
print("Best Score with MultinomialNB: %s" % gs_NB.best_score_)
for param_name in sorted(parametersNB.keys()):
    print("%s: %r" % (param_name, gs_NB.best_params_[param_name]))

In [None]:
#Evaluate with K-Fold
# Rebuild the pipeline with the best alpha from the grid search and score it
# on the folds of `cv`.
model_NB = pipeline(MultinomialNB(alpha=gs_NB.best_params_['clf__alpha'] ))
scores = cross_val_score(model_NB, X, y, cv=cv)
print("Scores in every iteration", scores)
# Mean score with a band of two standard deviations across folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


In [None]:
# `plot_learning_curve` is not defined in this notebook; presumably it comes
# from the `%run plot_learning_curve` script - verify.
plot_learning_curve(model_NB, "Learning curve with K-Fold", X, y, cv=cv)

In [None]:
# Cross-validate once more with a scorer that prints a classification report
# for every fold and returns that fold's accuracy.
# NOTE(review): previously labelled "nested CV with parameter optimization",
# but no parameter search runs inside these folds - `model_NB` is fixed.
nested_score = cross_val_score(model_NB, X, y, cv=cv, \
               scoring=make_scorer(classification_report_with_accuracy_score))
print (nested_score) 

### SVC

In [None]:
from sklearn.model_selection import GridSearchCV

#Optimize SVC
pipelineSVC = pipeline(SVC(C=1,gamma= 3, kernel='linear', probability=True))

# Search over C, gamma, kernel and the probability flag of the 'clf' step.
# NOTE(review): `gamma` is a coefficient of the non-linear kernels, so the
# 'linear' candidates repeat the same model for every gamma value; searching
# over `probability` doubles the grid again - confirm both are intended.
parametersSVC = {'clf__C':range(1,15),'clf__gamma': np.logspace(-6, -1, 10), 'clf__kernel': ('linear','rbf'),
                 'clf__probability':(True,False),}

# No `cv` argument is given, so GridSearchCV uses its own default splitting.
gs_SVC = GridSearchCV(pipelineSVC, parametersSVC, n_jobs=-1)

In [None]:
# Run the grid search on the whole data set.
gs_SVC= gs_SVC.fit(X,y)


In [None]:
# Report the best cross-validated score and the parameter values that produced it.
print("Best Score with SVC: %s" % gs_SVC.best_score_)
for param_name in sorted(parametersSVC.keys()):
    print("%s: %r" % (param_name, gs_SVC.best_params_[param_name]))

In [None]:
#Evaluate with K-Fold
# Rebuild the pipeline with the best parameters from the grid search and score
# it on the folds of `cv`.
C_SVC=gs_SVC.best_params_['clf__C']
gamma_SVC = gs_SVC.best_params_['clf__gamma']
kernel_SVC = gs_SVC.best_params_['clf__kernel']
probability_SVC = gs_SVC.best_params_['clf__probability']
model_SVC = pipeline(SVC(C=C_SVC,gamma=gamma_SVC, kernel=kernel_SVC, probability=probability_SVC))
scores = cross_val_score(model_SVC, X, y, cv=cv)
print("Scores in every iteration", scores)
# Mean score with a band of two standard deviations across folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# `plot_learning_curve` is not defined in this notebook; presumably it comes
# from the `%run plot_learning_curve` script - verify.
plot_learning_curve(model_SVC, "Learning curve with K-Fold", X, y, cv=cv)

In [None]:
# Cross-validate once more with a scorer that prints a classification report
# for every fold and returns that fold's accuracy.
# NOTE(review): previously labelled "nested CV with parameter optimization",
# but no parameter search runs inside these folds - `model_SVC` is fixed.
nested_score = cross_val_score(model_SVC, X, y, cv=cv, \
               scoring=make_scorer(classification_report_with_accuracy_score))
print (nested_score)

### KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

#Optimize KNeighborsClassifier
# Search over the number of neighbours (1..24) and the power parameter `p`
# of the distance metric of the 'clf' step.
pipelineKN = pipeline(KNeighborsClassifier(n_neighbors=3)) 
parametersKN = {'clf__n_neighbors': range(1,25), 'clf__p':(1,2)}

# No `cv` argument is given, so GridSearchCV uses its own default splitting.
gs_KN = GridSearchCV(pipelineKN, parametersKN, n_jobs=-1)

In [None]:
# Run the grid search on the whole data set.
gs_KN= gs_KN.fit(X,y)

In [None]:
# Report the best cross-validated score and the parameter values that produced it.
print("Best Score with KN: %s" % gs_KN.best_score_)
for param_name in sorted(parametersKN.keys()):
    print("%s: %r" % (param_name, gs_KN.best_params_[param_name]))

In [None]:
#Evaluate with K-Fold
# Rebuild the pipeline with the best parameters from the grid search (the
# first positional argument is n_neighbors) and score it on the folds of `cv`.
model_KN = pipeline(KNeighborsClassifier(gs_KN.best_params_['clf__n_neighbors'],p=gs_KN.best_params_['clf__p']))
scores = cross_val_score(model_KN, X, y, cv=cv)
print("Scores in every iteration", scores)
# Mean score with a band of two standard deviations across folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Cross-validate once more with a scorer that prints a classification report
# for every fold and returns that fold's accuracy.
# NOTE(review): previously labelled "nested CV with parameter optimization",
# but no parameter search runs inside these folds - `model_KN` is fixed.
nested_score = cross_val_score(model_KN, X, y, cv=cv, \
               scoring=make_scorer(classification_report_with_accuracy_score))
print (nested_score)

### LogisticRegression


In [None]:
from sklearn.linear_model import LogisticRegression
#Optimize LogisticRegression
# Search over penalty type, stopping tolerance and C of the 'clf' step.
# NOTE(review): the 'l1' candidates need a solver that supports L1; no solver
# is set here, so this relies on the default of the installed scikit-learn -
# verify.
pipelineLR =  pipeline(LogisticRegression(penalty='l2',tol=0.0001,C=1.0,n_jobs=-1)) 
parametersLR = {'clf__penalty': ['l1','l2'], 'clf__tol': [0.0001,0.001,0.01,0.1], 'clf__C': range(1,15)}

# No `cv` argument is given, so GridSearchCV uses its own default splitting.
gs_LR = GridSearchCV(pipelineLR, parametersLR, n_jobs=-1)

In [None]:
# Run the grid search on the whole data set.
gs_LR= gs_LR.fit(X,y)

In [None]:
# Report the best cross-validated score and the parameter values that produced it.
print("Best Score with LogisticRegression: %s" % gs_LR.best_score_)
for param_name in sorted(parametersLR.keys()):
    print("%s: %r" % (param_name, gs_LR.best_params_[param_name]))

In [None]:
#Evaluate with K-Fold
# NOTE(review): unlike the other classifiers, the hyper-parameters below are
# hard-coded instead of being read from gs_LR.best_params_ (the commented-out
# lines show that variant) - presumably copied from an earlier search run;
# confirm they still match the search result.
#penalty_LR=gs_LR.best_params_['clf__penalty']
#tol_LR = gs_LR.best_params_['clf__tol']
#C_LR = gs_LR.best_params_['clf__C']
#model_LR = pipeline(LogisticRegression(penalty=penalty_LR,tol=tol_LR,C=C_LR,n_jobs=-1))
model_LR = pipeline(LogisticRegression(penalty='l2',tol=0.01,C=14,n_jobs=-1))
scores = cross_val_score(model_LR, X,y, cv=cv)
print("Scores in every iteration", scores)
# Mean score with a band of two standard deviations across folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# `plot_learning_curve` is not defined in this notebook; presumably it comes
# from the `%run plot_learning_curve` script - verify.
plot_learning_curve(model_LR, "Learning curve with K-Fold", X, y, cv=cv)

In [None]:
# Cross-validate once more with a scorer that prints a classification report
# for every fold and returns that fold's accuracy.
# NOTE(review): previously labelled "nested CV with parameter optimization",
# but no parameter search runs inside these folds - `model_LR` is fixed.
nested_score = cross_val_score(model_LR, X, y, cv=cv, \
               scoring=make_scorer(classification_report_with_accuracy_score))
print (nested_score) 

In [None]:
#Train classifier
# Fit the logistic-regression pipeline on the whole data set.
model_LR.fit(X,y)

In [None]:


# Apply only the fitted 'features' step of the pipeline to X, skipping the classifier.
extracted_features = model_LR.named_steps['features'].transform(X)



In [None]:
# Display the transformed feature matrix.
extracted_features

In [None]:
import scipy.sparse

# Store the feature matrix on disk in SciPy's .npz sparse-matrix format.
scipy.sparse.save_npz('./extracted_features_themes.npz',extracted_features)

In [None]:
y=model_LR.predict(["Mi tratamiento contra el insomnio es no dormir siesta"])

In [None]:
y[0]

### RandomForests

In [None]:

from sklearn.ensemble import RandomForestClassifier

#Optimize RandomForests
# A fixed random_state makes the forests, and therefore the ranking of the
# candidates, reproducible from one run to the next.
pipelineRF = pipeline(RandomForestClassifier(n_estimators=10,n_jobs=-1,random_state=33))

# Search over the number of trees (1..24) of the 'clf' step.
parametersRF = {'clf__n_estimators': range (1,25)}

# No `cv` argument is given, so GridSearchCV uses its own default splitting.
gs_RF = GridSearchCV(pipelineRF, parametersRF, n_jobs=-1)

In [None]:
# Run the grid search on X and y.
gs_RF= gs_RF.fit(X,y)

In [None]:
# Report the best cross-validated score and the parameter values that produced it.
print("Best Score with RandomForests: %s" % gs_RF.best_score_)
for param_name in sorted(parametersRF.keys()):
    print("%s: %r" % (param_name, gs_RF.best_params_[param_name]))

In [None]:
#Evaluate with K-Fold
# Rebuild the pipeline with the best number of trees from the grid search and
# score it on the folds of `cv`.
estimators_RF=gs_RF.best_params_['clf__n_estimators']

# A fixed random_state keeps the reported scores reproducible across runs.
model_RF = pipeline(RandomForestClassifier(n_estimators=estimators_RF,n_jobs=-1,random_state=33))
scores = cross_val_score(model_RF, X,y, cv=cv)
print("Scores in every iteration", scores)
# Mean score with a band of two standard deviations across folds.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Cross-validate once more with a scorer that prints a classification report
# for every fold and returns that fold's accuracy.
# NOTE(review): previously labelled "nested CV with parameter optimization",
# but no parameter search runs inside these folds - `model_RF` is fixed.
nested_score = cross_val_score(model_RF, X, y, cv=cv, \
               scoring=make_scorer(classification_report_with_accuracy_score))
print (nested_score) 
        