In [1]:
from sklearn.datasets import load_files 

import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

from __future__ import print_function

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.externals import joblib
from sklearn.cluster import KMeans


# Descripción del set de datos

Contamos con un set de datos extraídos de twitter con información de TI en las areas de Big Data. CRM y Bases de Datos. 

Los tweets son a nivel global, y hemos separados los de idioma inglés.

Queremos encontrar las relaciones de los textos entre los diferentes países y las tecnologías descriptas, describir las tecnologías por las temáticas o hashtags más importantes, cuales son los principales usuarios.

Por ejemplo: que se dice alrededor de la palabra crm, en que países, cuales son las tecnologías y marcas más mencionadas al rededor de esta palabra, cuales y de que país son los usuarios más influyentes.


In [2]:
dateparse = lambda x: pd.to_datetime(x, utc=False, format='%Y-%m-%d',errors='coerce')

dataset = pd.read_csv('datos_kunan_ok.csv', sep=';', 
    encoding='utf-8', parse_dates=['created_at'], date_parser=dateparse)

In [3]:
dataset.describe(include=['object'])


Unnamed: 0,user_name,user_screen_name,text,retweeted_status_text,entities_hashtags_0_text,entities_hashtags_1_text,entities_hashtags_2_text,entities_hashtags_3_text,lang,user_location,place_country,place_country_code,place_full_name,place_id,place_name,place_place_type
count,2000,2000,2000,351,1207,950,733,553,2000,1885,1488,1488,1488,1488,1488,1488
unique,876,877,1940,329,510,398,308,232,1,383,35,17,156,128,150,1
top,a9a7ef7b233aca755affeb84c12f8dc3,30082b1cbbd3bf9752446e2796ac12b4,"Elastic search, multiple indexes vs one index ...",.@Oracle has big plans for #India: Safra Catz ...,BigData,job,Shanghai,Job,en,"bengaluru, india",United Arab Emirates,AE,"Dubai, United Arab Emirates",001907e868d06e24,Dubai,admin
freq,168,168,4,4,67,40,34,27,2000,512,431,453,348,368,349,1488


In [4]:
# preparar funciones de procesamiento de texto
def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [5]:
# e.g.:
tokenize_and_stem('cats are running')

['cat', 'are', 'run']

**Texto de prueba**

In [6]:
text=dataset.text.values

In [7]:
text.shape

(2000,)

**Separar en palabras y usar las raíces de los vocablos**

In [8]:
palabras = []

for i in text:
    p = tokenize_and_stem(i)
    palabras.extend(p)

**Limpiar las "palabras vacias"**

In [23]:
# lista de "stopwords"
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('english')

stopwords.append('https')
stopwords.append('rt')

print(stopwords)


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [24]:
f_text = [word for word in palabras if word not in stopwords]
print(f_text)

['asm', 'tour', 'around', 'kingdom', 'mainten', 'contract', 'flour', 'mill', 'region', 'check', 'plc…', '//t.co/86i0lev9kv', 'float', 'oracl', 'middl', 'east', '//t.co/r3qa4inead', 'amba17', 'k2partner', 'salesforceu', 'great', 'event', '//t.co/mf2fkaadrz', 'right', 'cousin', '//t.co/nomvojgo7x', 'bigdata', 'onli', 'buzzword', 'bigdata', 'data', 'arabnetdubai', 'arabnet', 'arabnetm', 'arabnetm', '//t.co/zeisbigjkt', 'lucki', '//t.co/svned3ci0u', 'nearbi', 'love', 'old', 'hous', 'oracl', 'oracl', 'dubai', '//t.co/gtszgnw36l', 'claaiireeyyy', 'jm_galario', 'anneyeong', 'bessi', '//t.co/cg6nfdmdir', 'solid', 'blue', 'poor', 'perform', 'baggi', 'coyb', 'efc', 'danni', 'devito', 'treasur', '//t.co/ang3hb2akc', 'acsabudhabi', 'student', 'stand', 'human', 'traffick', 'amp', 'modern-day', 'slaveri', 'w/', 'power', 'perform', 'on…', '//t.co/kp5roczag', 'saw', 'iliad', 'doubl', 'tripl', 'wow', 'amaz', 'perform', '//t.co/fywjlmxfgk', 'get', 'good', 'respons', 'data', 'alway', 'good', 'pleas', 'pa

**Crear una tabla de palabras**

In [25]:
vocab_frame = pd.DataFrame({'words': f_text}, index = range(len(f_text)))
print(vocab_frame[0:10])

      words
0       asm
1      tour
2    around
3   kingdom
4   mainten
5  contract
6     flour
7      mill
8    region
9     check


In [26]:
# Compute the term frequency-inverse document frequency matrix
tfidf_vectorizer = TfidfVectorizer(binary=True, max_df=0.95,
                                 min_df=20, stop_words=stopwords,
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,5))

tfidf_matrix = tfidf_vectorizer.fit_transform(text)
print("La matrix tiene %i filas (documentos) y %i columnas (palabras)\n" % tfidf_matrix.shape)

La matrix tiene 2000 filas (documentos) y 228 columnas (palabras)



In [27]:
terms = tfidf_vectorizer.get_feature_names()
print("Hay en total %i palabras:\n" % len(terms))
print(terms)

Hay en total 228 palabras:

["'m", "'s", 'account', 'ai', 'amp', 'analyt', 'analyt innovation…', 'analyt leadership', 'analyt manag', 'analyt manag shanghai', 'analytics…', 'anyon', 'anyon job', 'app', 'appli', 'audit', 'audit analyt', 'audit analyt manag', 'audit analyt manag shanghai', 'b3', 'b3 disrupt', 'b3 disrupt b3', 'best', 'big', 'big data', 'bigdata', 'bigdata analyt', 'bigdata iot', 'bigdata…', 'blockchain', 'busi', 'carpet', 'carpet manufacturer/…', 'ceo', 'chang', 'clear', 'clear disrupt', 'clear disrupt b3', 'clear disrupt b3 disrupt', 'clear disrupt b3 disrupt b3', 'click', 'click appli', 'cloud', 'cmo', 'come', 'compani', 'crm', 'crt/crx/cb/crm/sacking/hessian', 'crt/crx/cb/crm/sacking/hessian jute', 'crt/crx/cb/crm/sacking/hessian jute yarn', 'crt/crx/cb/crm/sacking/hessian jute yarn world', 'crt/crx/cb/crm/sacking/hessian jute yarn world wide', 'custom', 'data', 'data analyt', 'databas', 'datasci', 'datascientist', 'datascientist saa', 'datascientist saa analyt', 'dat

## Buscando grupos de documentos con K-Means

In [28]:
num_clusters = 10
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

In [29]:
# Recuento del número de elementos en cada cluster
for i in range(num_clusters):
    print ('El cluster %i tiene %i elementos' % (i, clusters.count(i)))

El cluster 0 tiene 498 elementos
El cluster 1 tiene 40 elementos
El cluster 2 tiene 264 elementos
El cluster 3 tiene 167 elementos
El cluster 4 tiene 245 elementos
El cluster 5 tiene 122 elementos
El cluster 6 tiene 183 elementos
El cluster 7 tiene 140 elementos
El cluster 8 tiene 109 elementos
El cluster 9 tiene 232 elementos


In [30]:
dist = 1 - cosine_similarity(tfidf_matrix)

In [31]:
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]     
        
for i in range(num_clusters):
    print("[[ Cluster %d ]]" % i, end='\n\n')
    
    print("   WORDS /// ", end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(terms[ind], end=' / ')
    print('\n\n')


Top terms per cluster:

[[ Cluster 0 ]]

   WORDS /// salesforc / crm / amp / use / salesforceblog / new / 


[[ Cluster 1 ]]

   WORDS /// n't / perform / 's / get / doe / one / 


[[ Cluster 2 ]]

   WORDS /// analyt / market / entrepreneur / director / datascientist / saa / 


[[ Cluster 3 ]]

   WORDS /// yarn world wide carpet manufacturer/… / carpet / manufacturer/… / jute yarn world wide carpet / jute yarn world wide / wide / 


[[ Cluster 4 ]]

   WORDS /// bigdata / iot / ai / machinelearn / datasci / fintech / 


[[ Cluster 5 ]]

   WORDS /// oracl / cloud / india / amp / help / thank / 


[[ Cluster 6 ]]

   WORDS /// databas / job / hire / open / click / click appli / 


[[ Cluster 7 ]]

   WORDS /// data / data analyt / analyt / big data / big / bigdata / 


[[ Cluster 8 ]]

   WORDS /// 's / perform / oracl / today / amp / last / 


[[ Cluster 9 ]]

   WORDS /// perform / music / amp / love / today / great / 




In [32]:
dataset['clusters'] = clusters

In [33]:
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]     
        
for i in range(num_clusters):
    print("*** Cluster %d:" % i, end='\n\n')
    
    print("WORDS /// ", end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(terms[ind], end=' / ')
   
    # Imprime los paises y usuarios principales
    print('\n') #add whitespace
    print('Cluster por País y Usuarios') #add whitespace
    print(dataset[dataset['clusters']==i].groupby(['place_country','user_name']).count().sort_values(['id'],ascending=False)[:]['id'])

    print('\n') #add whitespace
    print('Cluster por hashtags ') #add whitespace
    print(dataset[dataset['clusters']==i].groupby(['entities_hashtags_0_text']).count().sort_values(['id'],ascending=False)[:]['id'])
    print ('El cluster %i tiene %i elementos' % (i, clusters.count(i)))
    
    
    print(' ') #add whitespace
    print(' ') #add whitespace

Top terms per cluster:

*** Cluster 0:

WORDS /// salesforc / crm / amp / use / salesforceblog / new / 

Cluster por País y Usuarios
place_country                  user_name                       
People's Republic of China     9dedaa716137e68b13461ab1673fe8bc    9
                               a5f1b8c67aaaa80d32c316255fc2ab3d    8
                               63e69048c5ade464c8022dc1137a9df1    7
United Arab Emirates           c5ee23e54567836b4b014da43c51f46c    7
                               7663ab23298f910b622a3d41e9369e5d    6
                               4b4c5bba861617aef77f6e739369d972    6
People's Republic of China     6d88accca0cace0a65e6212a71bbb607    5
Canada                         81abcea6e16bc538d9843e8808b2066b    5
Switzerland                    9c46b9290f4bd7ccfc7113a67ce219e9    5
                               4956b848ab3c9cad4273461fa22221e5    4
Germany                        42d46b3b2631f2ad71903eab586762d5    3
Cameroon                       d85aaf8083392

## Conclusión hasta el momento
Hemos encontrado mediante las técnicas de clustering describir más los datos:
Cluster 0: Salesforce en paises como China y Emiratos Arabes, usuarios influyentes: 9dedaa716137e68b13461ab1673fe8bc y a5f1b8c67aaaa80d32c316255fc2ab3d

Cluster 1: no nos dice mucho

Cluster 2: habla de los puestos en tecnología, analistas, marketing, emprendedores, directores, datascientist en Argentina de un usuario influyente 0772b359f6fe2426c015cd79cd5cf079 con 128 tweets

Cluster 3: habla de yarn en Bangladesh y 1 solo usuario

Cluster 4: big data, ml, iot en Emiratos Arabes, Argfentina y Australia

y hasta aquí llegamos



