**Important:** First run the mongo data transfer with command:
`script/dumping_database.sh` 
since selenium-scraper folder

In [1]:
import os
HOST_MONGO = os.environ["SOCIALMONGO"]
MONGO_DB = os.environ["SOCIALMONGODB"]
print("HOST MONGO: ",HOST_MONGO)
print("DATA BASE:",MONGO_DB)
# In production DATA BASE should be contents

HOST MONGO:  mongodb://mongo:27017
DATA BASE: contents


## Importing mongo Collection

In [2]:
from pymongo import MongoClient
client = MongoClient(HOST_MONGO)
db = client[MONGO_DB]
posts = db['labeled_posts']
dictionaries = db['thematic_dictionaries']
print('Number of posts',posts.count())
print('Number of categories',dictionaries.count())

Number of posts 41231
Number of categories 2623


## Creating a DataFrames

In [3]:
import pandas as pd
df_posts = pd.DataFrame(list(posts.find()))

In [4]:
##### import numpy as np
df_dictionary = pd.DataFrame(list(dictionaries.find()))
thematics = set(df_dictionary["thematic"])
thematic_to_number = dict(zip(thematics,range(len(thematics))))
number_to_thematic = dict(zip(thematic_to_number.values(),thematic_to_number.keys()))
#thematic_to_number
dict_words_to_numeric = dict(zip(df_dictionary["word"],df_dictionary["thematic"].map(lambda x: thematic_to_number[x])))
df_posts["thematic_id"] = df_posts["thematic"].map(lambda x: dict_words_to_numeric[x])
df_posts

Unnamed: 0,_id,image,network_type,text,thematic,thematic_id
0,5a15c6ca6f8c0fd1d797231b,https://scontent-iad3-1.cdninstagram.com/t51.2...,Instagram,#Frases FrasesDeHoy \n#libros #literatura #esc...,Escritoras,5
1,5a15c6ca6f8c0fd1d7972325,https://scontent-iad3-1.cdninstagram.com/t51.2...,Instagram,Simplemente una mujer agradecida De Dios por t...,Escritoras,5
2,5a15c6cb6f8c0fd1d7972330,https://pbs.twimg.com/media/DPPZliRWsAEtTsi.jpg,Twitter,Accurate time keeping tools can reduce payroll...,businesstips,9
3,5a15c6cb6f8c0fd1d7972332,https://pbs.twimg.com/media/DPP1UHFV4AEMLnn.jpg,Twitter,A better way to retire & travel the world your...,businesstips,9
4,5a15c6cc6f8c0fd1d7972335,https://pbs.twimg.com/media/DOweKxwUQAA2zwq.jpg,Twitter,8 SEO Myths to Leave Behind in 2017 · Web It 1...,businesstips,9
5,5a15c6cc6f8c0fd1d7972338,https://pbs.twimg.com/ext_tw_video_thumb/93242...,Twitter,F$%K WHAT THEY THINK\n#entrepreneur #motivatio...,businesstips,9
6,5a15c6cc6f8c0fd1d797233a,https://pbs.twimg.com/media/DPOujIzUMAATTbQ.jpg,Twitter,Reposting @thethreshold:\nThank you to @motani...,businesstips,9
7,5a15c6cd6f8c0fd1d797233d,,Twitter,Let’s face it: Many potential customers expect...,businesstips,9
8,5a15c6cd6f8c0fd1d7972344,https://pbs.twimg.com/media/DPQf330XkAEb4VW.png,Twitter,Using blogs to market your small business. Wat...,businesstips,9
9,5a15c6cd6f8c0fd1d7972348,https://pbs.twimg.com/media/DPQZ23rWAAAu_pD.jpg,Twitter,What Does Hot Buttered Rum Have to do With Bus...,businesstips,9


In [5]:
thematic_to_number

{'Art and Entertainment': 5,
 'Business': 9,
 'Fashion': 2,
 'Food': 7,
 'Lifestyle': 1,
 'Music': 4,
 'News and Politics': 0,
 'Science': 3,
 'Sports': 6,
 'Technology': 8}

In [6]:
number_to_thematic

{0: 'News and Politics',
 1: 'Lifestyle',
 2: 'Fashion',
 3: 'Science',
 4: 'Music',
 5: 'Art and Entertainment',
 6: 'Sports',
 7: 'Food',
 8: 'Technology',
 9: 'Business'}

In [7]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df_posts['text'], 
                                                    df_posts['thematic_id'], 
                                                    random_state=0)

## Vectorizing

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# CHOOSE ONE OF THE FOLLOWING VECTORIZERS:

# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
#vect = TfidfVectorizer(min_df=5).fit(X_train)

# Fit the CountVectorizer to the training data specifiying a minimum document frequency of 5 and extracting 1-grams and 2-grams
#vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

# DIMENSION WORDS LIST:
#vect.get_feature_names()[::2000]
# AMOUNT OF WORDS:
#len(vect.get_feature_names())

# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

## Trainning a Model

In [9]:
from sklearn.linear_model import LogisticRegression
# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Model Scores

### General Score in train

In [10]:
train_score = model.score(X_train_vectorized,y_train)
print("Score in train:",train_score)

Score in train: 0.978042233936


### General Score in test

In [11]:
X_test_vectorized = vect.transform(X_test)
y_test_est = model.predict(X_test_vectorized)
model.score(X_test_vectorized,y_test)

0.86777260380287158

### Confusion Matrix

In [19]:
y_test

8562     5
34080    9
7961     5
34051    5
2839     5
8422     5
22638    5
30412    9
23021    5
16903    5
5420     2
12294    9
4430     2
16614    5
28295    9
322      5
16200    5
22181    5
26515    5
30750    9
14127    9
28392    9
36438    2
2663     5
20292    2
25717    5
22703    5
39360    2
7482     2
19186    9
        ..
9741     9
39402    2
232      5
22625    5
39599    2
21791    5
37583    9
6613     2
25716    5
27152    5
37234    9
11046    5
14750    5
17707    9
5575     2
23405    5
8255     5
10714    5
37835    9
27813    9
6693     2
26125    5
2253     5
15609    5
40806    2
12919    5
32631    2
31615    9
11553    5
3580     9
Name: thematic_id, Length: 10308, dtype: int64

In [17]:
from sklearn.metrics import confusion_matrix
c = confusion_matrix(y_test, y_test_est)
cantidad = c.sum(axis=1)
diagonal = c.diagonal()
accuracy = diagonal/cantidad
accuracy

array([ 0.47014925,  0.77000503,  0.92554992,  0.06666667,  0.86382114])

In [13]:
c

array([[  63,    5,   16,    0,   50],
       [   2, 1530,  297,    0,  158],
       [   4,   96, 4376,    0,  252],
       [   0,    0,    9,    1,    5],
       [  14,   88,  366,    1, 2975]])

## Using Model

In [14]:
thematic_to_number

{'fashion': 3, 'fitness': 1, 'food': 0, 'futbol': 4, 'makeup': 5, 'music': 2}

In [59]:
post = "Participaré como speaker en el #FOM17LA, junto con @AlexaOlavarria para hablar sobre Influencer Marketing."
posts = ["Hurry, only a few more days left to make the most of Kids Eat Free at all @JamiesItalianUK. Avail until 29/10, book: http://jamieol.com/KEFOct29"
,"Entire oranges in our @drinkrobinsons orange squash? Pith 'un all? 😮🍊 @jimmysfarm investigates on #FoodUnwrapped TONIGHT - 8pm @Channel4"]
def thematicsClassifier(posts):
    if type(posts) == str: 
        posts = [posts]
    labels = list(map(lambda x: number_to_thematic[x],model.predict(vect.transform(posts))))
    probs = list(map(lambda x: max(x),model.predict_proba(vect.transform(posts))))
    return [labels,probs]

output = thematicsClassifier(posts) #change it for posts
print("Output:",output)

Output: [['food', 'music'], [0.90092639297283827, 0.28349886923603657]]


## Dumping Model

In [16]:
import shelve
with shelve.open('models/thematics_classifier') as shelve:
    shelve.clear()
    shelve['vect'] = vect
    shelve['model'] = model
    shelve['number_to_thematic'] = number_to_thematic