In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
import numpy as np

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 30 days


## Data prep

In [2]:
f = open("../data/spanish_stop_words.txt", 'rU')
spanish_stop = []
for line in f:
    spanish_stop.append(line.rstrip())
f.close()
spanish_stop[:6]

['un', 'una', 'unas', 'unos', 'uno', 'sobre']

In [3]:
def get_tweets(input_path):
    docs = []
    f = open(input_path, 'rU')
    for line in f:
        docs.append(line.rstrip())       
    f.close()    
    return docs

In [4]:
bot_tweets = get_tweets("../data/tweets/botTexts.txt")
human_tweets = get_tweets("../data/tweets/humanText.txt")

print "number of bot tweets loaded", len(bot_tweets)
print "nuber of human tweets loaded", len(human_tweets)

number of bot tweets loaded 9208
nuber of human tweets loaded 33416


In [5]:
clean_bot_tweets = []
for i, bot_tweet in enumerate(bot_tweets):
    if i%2 == 0:
        clean_bot_tweets.append(bot_tweet)

In [6]:
clean_human_tweets = []
for i, human_tweet in enumerate(human_tweets):
    if i%2 == 0:
        clean_human_tweets.append(human_tweet)

In [7]:
print "number of human tweets", len(clean_human_tweets)
print "number of bot tweets", len(clean_bot_tweets)


number of human tweets 16708
number of bot tweets 4604


In [13]:
thefile = open('../data/clean_bot_tweets.txt', 'w')
for tweet in clean_bot_tweets:
    thefile.write("%s\n" % tweet)
thefile = open('../data/clean_human_tweets.txt', 'w')
for tweet in clean_human_tweets:
    thefile.write("%s\n" % tweet)

## TF-IDF and LSI definitions

In [239]:
def LSI(input_human_tweets, input_bot_tweets, stop_words = None):
    tweets = input_bot_tweets + input_human_tweets
    count_vect = CountVectorizer(min_df = 1, analyzer = "word", stop_words=stop_words, ngram_range = (1,2))
    X_train_counts = count_vect.fit_transform(tweets)


    tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    
    svd = TruncatedSVD(n_components=4, n_iter=7, random_state=42)
    svd_articles = svd.fit_transform(X_train_tf)
    articles_svd_df = pd.DataFrame(svd_articles)
    articles_svd_df.rename(columns=lambda x: "pc" + str(x), inplace=True)
    
    articles_svd_df["source"] = "human"    
    articles_svd_df["source"][:len(input_bot_tweets)] = "bot"
    articles_svd_df["tweet"] = tweets
    
    return articles_svd_df

## LSI on raw tweets

In [209]:
raw_tweets_LSI = LSI(input_human_tweets = clean_human_tweets, input_bot_tweets = clean_bot_tweets)
#raw_tweets_LSI.to_csv("../results/LSI.csv", index = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Remove retweets

In [67]:
clean_bot_tweets[3][:2] == 'RT'

False

In [99]:
no_rt_bot_tweets = []
for tweet in clean_bot_tweets:
    if tweet[:2] != "RT":
        no_rt_bot_tweets.append(tweet)


In [72]:
no_rt_human_tweets = []
for clean_human_tweet in clean_human_tweets:
    if clean_human_tweet[:2] != "RT":
        no_rt_human_tweets.append(clean_human_tweet)

In [212]:
no_rt_LSI = LSI(input_human_tweets = no_rt_human_tweets, input_bot_tweets = no_rt_bot_tweets)
#no_rt_LSI.to_csv("../results/LSI-no-rt.csv", index = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [133]:
total_counts = np.sum(X_train_counts, axis = 0)

In [134]:
total_counts[:5]

matrix([[2, 1, 1, ..., 1, 1, 1]])

In [135]:
np.sort(total_counts,)

matrix([[   1,    1,    1, ..., 3283, 4494, 4759]])

In [166]:
count_vect.get_feature_names()[1540]

u'co'

In [161]:
np.array(np.argsort(-total_counts)).squeeze()[:4]

array([3177, 1540, 6144, 1869])

## Remove URLs

In [222]:
tweet = no_rt_bot_tweets[3]
tweet

'Se Vuelve Tendencia Nacional En Redes Tras Informe De CNDH https://t.co/u8DFQ9hVsC'

In [223]:
re.sub(r'https?:\/\/.*[\r\n]*', '', tweet.lower())

'se vuelve tendencia nacional en redes tras informe de cndh '

In [184]:
url_free_bot_tweets = [re.sub(r'https?:\/\/.*[\r\n]*', '', tweet.lower()) for tweet in no_rt_bot_tweets]
url_free_human_tweets = [re.sub(r'https?:\/\/.*[\r\n]*', '', tweet.lower()) for tweet in no_rt_human_tweets]

In [186]:
url_free_tweets = url_free_bot_tweets + url_free_human_tweets

In [224]:
our_stop_words = spanish_stop + ["tanhuato", 'cndh', 'ddhh']

In [241]:
url_free_LSI = LSI(input_human_tweets = url_free_human_tweets, input_bot_tweets = url_free_bot_tweets, 
                   stop_words= spanish_stop)
url_free_LSI.to_csv("../results/LSI-no-url.csv", index = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## count number of words and stop words for bots vs humans

In [140]:
no_rt_bot_tweets[:10]

['Informe sobre Tanhuato corrobora uso de tortura: Martnez Neri',
 'Informe sobre Tanhuato corrobora uso de tortura: Martnez Neri',
 'Informe sobre Tanhuato corrobora uso de tortura: Martnez Neri https://t.co/6nG9wCDx6w',
 'Se Vuelve Tendencia Nacional En Redes Tras Informe De CNDH https://t.co/u8DFQ9hVsC',
 '"Tanhuato" was a trending topic in Mexico at rank 10 for duration 1h:45m .',
 'Informe sobre Tanhuato corrobora uso de tortura: Martnez Neri https://t.co/2ZCFTrI0Ge',
 'Confirma CNDH que Polica Federal ejecut a 22 en Tanhuato Michoacn https://t.co/FB3uQlxDDd',
 'https://t.co/K466ZctXSS RT https://t.co/cIS3jFyWL4',
 '#Reformenlareforma y resolvamos civilizadamente este tema por favor. https://t.co/IuwvYnFNdC',
 'Informe sobre Tanhuato corrobora uso de tortura: Martnez Neri']

In [141]:
no_rt_human_tweets[:10]

['Informe sobre Tanhuato corrobora uso de tortura: Martnez Neri https://t.co/pm1xTKHpq6',
 'Miren cmo est el acoso en Twitter a quienes nos atrevimos a publicar la verdad sobre la masacre de #Tanhuato https://t.co/8Ye4dCyqdK',
 'Leer y saber para decir basta, #CNDH divulga masacre policiaca https://t.co/6d22eFHBtF',
 '@RFormulaQROO cheque los tuit de Isabel Miranda sobre #Tanhuato valen la pena',
 'Abuso de autoridad, peritajes a modo: las claves del informe sobre ejecuciones en Tanhuato https://t.co/V1WMKFtxFe va @Pajaropolitico',
 '@Alejandro_Marti Por qu no inconformarnos con la masacre de Tanhuato Sr. Mart?',
 'https',
 'Ejecuciones en Tanhuato https://t.co/CXfRMOnWSK Opinin de @rivapa',
 'Rechaza CNSeguridadmx ejecuciones arbitrarias en caso #Tanhuato https://t.co/OIRrpMCmJy',
 'Entre #Tanhuato, #Nochixtln, #Ayotzinapa, la #CNTE, @Javier_Duarte, @betoborge, @GoberDuarte  y el 2018, nos estamos hundiendo como Pas.']

In [142]:
bot_num_chars = [len(tweet) for tweet in no_rt_bot_tweets]

## Average Number of characters in human tweets vs bot tweets

In [143]:
human_num_chars = [len(tweet) for tweet in no_rt_human_tweets]

In [146]:
sum(bot_num_chars)/len(no_rt_bot_tweets)

34

In [147]:
sum(human_num_chars)/len(no_rt_human_tweets)

25