In [210]:
# Requires pymongo 3.6.0+
from pymongo import MongoClient
from bson.code import Code
import pandas as pd

client = MongoClient("mongodb://bigdata-mongodb-04.virtual.uniandes.edu.co:8087/", retryWrites=False)
database = client["Grupo03"]
collection = database["COL_dataset"]

In [211]:
# Created with Studio 3T, the IDE for MongoDB - https://studio3t.com/

query = {}
projection = {}
projection["reply_or_quote"] = 1.0

data = []
cursor = collection.find(query, projection = projection)
try:
    for doc in cursor:
        data.append([doc['_id'], doc['reply_or_quote']])
finally:
    client.close()

In [212]:
df = pd.DataFrame(data,columns=['_id', 'text'])
df.head()

Unnamed: 0,_id,text
0,5e95c25095e4b2cefb66fbf2,@ClaudiaLopez @infopresidencia @Bogota Para cu...
1,5e95c25095e4b2cefb66fbf3,@ClaudiaLopez @infopresidencia @Bogota La verd...
2,5e95c25095e4b2cefb66fbf4,"@ClaudiaLopez @infopresidencia @Bogota Pero, x..."
3,5e95c25095e4b2cefb66fbf5,@ClaudiaLopez @infopresidencia @Bogota Ingreso...
4,5e95c25095e4b2cefb66fbf6,@ClaudiaLopez @infopresidencia @Bogota Estimad...


In [213]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re

In [214]:
def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove tweeted at
    return tweet

In [222]:
my_stopwords = nltk.corpus.stopwords.words('spanish')
stopwords = ['d', 'x', 'pa', 'q', 'si', 'usted', 'tan', 'solo', 'ser', 'bien', 'así', 'mas', 'va', 'van', 'señor', 'hace', 'hacer', 
             'siempre', 'gracias', 'favor', 'puede', 'dio', 'como', 'aquí', 'ahí']
my_stopwords.extend(stopwords)
print(my_stopwords)
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# cleaning master function
def clean_tweet(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower() # lower case
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet) # remove numbers
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords] # remove stopwords

    tweet_token_list = [word_rooter(word) if '#' not in word else word
                        for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    return tweet

['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se', 'las', 'por', 'un', 'para', 'con', 'no', 'una', 'su', 'al', 'lo', 'como', 'más', 'pero', 'sus', 'le', 'ya', 'o', 'este', 'sí', 'porque', 'esta', 'entre', 'cuando', 'muy', 'sin', 'sobre', 'también', 'me', 'hasta', 'hay', 'donde', 'quien', 'desde', 'todo', 'nos', 'durante', 'todos', 'uno', 'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'e', 'esto', 'mí', 'antes', 'algunos', 'qué', 'unos', 'yo', 'otro', 'otras', 'otra', 'él', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada', 'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo', 'nosotros', 'mi', 'mis', 'tú', 'te', 'ti', 'tu', 'tus', 'ellas', 'nosotras', 'vosotros', 'vosotras', 'os', 'mío', 'mía', 'míos', 'mías', 'tuyo', 'tuya', 'tuyos', 'tuyas', 'suyo', 'suya', 'suyos', 'suyas', 'nuestro', 'nuestra', 'nuestros', 'nuestras', 'vuestro', 'vuestra', 'vuestros', 'vuestras', 'esos', 'esas', 'estoy', 'estás', 'está', 'estamos', 'estáis', 'están', 'e

In [223]:
df['clean_tweet'] = df.text.apply(clean_tweet)

In [224]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(df['clean_tweet']).toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()

In [225]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 5

model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

In [226]:
model.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [227]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [228]:
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights
0,colombia,305.6,gobierno,259.1,paí,295.8,dio,274.2,ayuda,366.5
1,petro,193.2,estrato,170.6,president,223.7,senador,167.7,gent,214.3
2,mejor,183.4,pueblo,134.2,alcald,188.1,urib,165.1,persona,202.2
3,cuba,182.2,ayuda,130.8,día,144.5,noticia,153.3,meno,182.4
4,salud,179.6,recurso,121.9,colombia,130.9,grand,138.7,alcaldesa,163.5
5,fuerza,153.2,robar,117.3,ahora,126.4,president,137.3,claudia,123.2
6,médico,141.9,dinero,114.3,gent,122.1,buena,132.5,dio,109.6
7,colombiano,137.1,servicio,113.2,caso,114.0,mano,118.1,banco,109.2
8,mucha,131.2,hp,111.2,gobierno,110.6,petro,110.1,empresa,108.9
9,mismo,119.8,plata,100.7,da,105.3,ud,103.7,familia,108.6
