**Important:** First run the mongo data transfer with command:
`script/dumping_database.sh` 
since selenium-scraper folder

In [39]:
import os
HOST_MONGO = os.environ["SOCIALMONGO"]
MONGO_DB = os.environ["SOCIALMONGODB"]
print("HOST MONGO: ",HOST_MONGO)
print("DATA BASE:",MONGO_DB)
# In production DATA BASE should be contents

HOST MONGO:  mongodb://mongo:27017
DATA BASE: contents


## Importing mongo Collection

In [40]:
from pymongo import MongoClient
client = MongoClient(HOST_MONGO)
db = client[MONGO_DB]
posts = db['labeled_posts']
dictionaries = db['thematic_dictionaries']
print('Number of posts',posts.count())
print('Number of categories',dictionaries.count())

Number of posts 40588
Number of categories 180


## Creating a DataFrames

In [41]:
import pandas as pd
df_posts = pd.DataFrame(list(posts.find()))

In [66]:
import numpy as np
df_dictionary = pd.DataFrame(list(dictionaries.find()))
thematics = set(df_dictionary["thematic"])
thematic_to_number = dict(zip(thematics,range(len(thematics))))
number_to_thematic = dict(zip(thematic_to_number.values(),thematic_to_number.keys()))
#thematic_to_number
dict_words_to_numeric = dict(zip(df_dictionary["word"],df_dictionary["thematic"].map(lambda x: thematic_to_number[x])))
df_posts["thematic_id"] = df_posts["thematic"].map(lambda x: dict_words_to_numeric[x])
df_posts

Unnamed: 0,_id,image,network_type,text,thematic,thematic_id
0,59dd2801c25d306e2e4b1ebc,,Twitter,"The most difficult team to beat in football, i...",football,5
1,59dd2801c25d306e2e4b1ebf,https://pbs.twimg.com/media/DLy93UHWkAAbrMk.jpg,Twitter,Good luck to Argentina & Portugal tonight. \...,football,5
2,59dd2801c25d306e2e4b1ec2,,Twitter,"Premier league better , been performing amazin...",futbol,5
3,59dd2801c25d306e2e4b1ec4,https://pbs.twimg.com/media/DLzU7fGW0AAjjtw.jpg,Twitter,Andre Silva has been involved in 10 goals in h...,football,5
4,59dd2801c25d306e2e4b1ec6,,Twitter,"For me... hard times or reining days, or dark ...",futbol,5
5,59dd2801c25d306e2e4b1ec9,https://pbs.twimg.com/media/DLwCLAwUQAAC1rD.jpg,Twitter,It's Judgment Day for Messi and Argentina.,soccer,5
6,59dd2801c25d306e2e4b1ecb,https://pbs.twimg.com/ext_tw_video_thumb/91767...,Twitter,The quality on show in Australia vs Syria was ...,football,5
7,59dd2801c25d306e2e4b1ecd,,Twitter,The French national team is like a wonderful s...,soccer,5
8,59dd2801c25d306e2e4b1ecf,https://pbs.twimg.com/media/DLzIKExWAAADDqq.jpg,Twitter,Eden Hazard has now been directly involved in ...,football,5
9,59dd2801c25d306e2e4b1ed1,https://pbs.twimg.com/media/DLzULdmV4AAdIqc.jpg,Twitter,ANDRE SILVA MAKES IT TWO! Portugal look well o...,look,4


In [67]:
thematic_to_number

{'fashion': 4, 'fitness': 3, 'food': 2, 'futbol': 5, 'makeup': 0, 'music': 1}

In [68]:
number_to_thematic

{0: 'makeup', 1: 'music', 2: 'food', 3: 'fitness', 4: 'fashion', 5: 'futbol'}

In [44]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df_posts['text'], 
                                                    df_posts['thematic_id'], 
                                                    random_state=0)

## Vectorizing

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# CHOOSE ONE OF THE FOLLOWING VECTORIZERS:

# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
#vect = TfidfVectorizer(min_df=5).fit(X_train)

# Fit the CountVectorizer to the training data specifiying a minimum document frequency of 5 and extracting 1-grams and 2-grams
#vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

# DIMENSION WORDS LIST:
#vect.get_feature_names()[::2000]
# AMOUNT OF WORDS:
#len(vect.get_feature_names())

# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

## Trainning a Model

In [46]:
from sklearn.linear_model import LogisticRegression
# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Model Scores

### General Score in train

In [47]:
train_score = model.score(X_train_vectorized,y_train)
print("Score in train:",train_score)

Score in train: 0.981636608521


### General Score in test

In [48]:
X_test_vectorized = vect.transform(X_test)
y_test_est = model.predict(X_test_vectorized)
model.score(X_test_vectorized,y_test)

0.87671232876712324

### Confusion Matrix

In [53]:
from sklearn.metrics import confusion_matrix
c = confusion_matrix(y_test, y_test_est)
cantidad = c.sum(axis=1)
diagonal = c.diagonal()
accuracy = diagonal/cantidad
accuracy

array([ 0.84690327,  0.88110403,  0.89572349,  0.8600848 ,  0.87032843,
        0.90070505])

## Using Model

In [57]:
thematic_to_number

{'fashion': 4, 'fitness': 3, 'food': 2, 'futbol': 5, 'makeup': 0, 'music': 1}

In [85]:
post = "Hurry, only a few more days left to make the most of Kids Eat Free at all @JamiesItalianUK. Avail until 29/10, book: http://jamieol.com/KEFOct29"
posts = ["Hurry, only a few more days left to make the most of Kids Eat Free at all @JamiesItalianUK. Avail until 29/10, book: http://jamieol.com/KEFOct29"
,"Hurry, only a few more days left to make the most of Kids Eat Free at all @JamiesItalianUK. Avail until 29/10, book: http://jamieol.com/KEFOct29"]
output = list(map(lambda x: number_to_thematic[x],model.predict(vect.transform(pd.Series(post))))) #change it for posts
print("Output:",output)

Output: ['food']


## Dumping Model