In [None]:
# general import of libraries and visualization tools (matplotlib and seaborn)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# IPython magics: greedy tab completion and inline rendering of matplotlib figures
%config IPCompleter.greedy=True
%matplotlib inline

plt.style.use('default') # making the plots a bit prettier in matplotlib
#plt.rcParams['figure.figsize'] = (20, 10)

sns.set(style="whitegrid") # setting the grid type in seaborn

pd.options.display.float_format = '{:20,.2f}'.format # suppress scientific notation in the outputs

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas/seaborn deprecation warnings.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training set and preview its first five rows.
raw_data = pd.read_csv('./train.csv')
raw_data.iloc[:5]

In [None]:
# Top ten locations with the most true tweets
target_stats_by_location = raw_data.groupby('location').agg({'target': ['count', 'sum']})
target_stats_by_location[('target', 'sum')].nlargest(10)

In [None]:
# Per location: number of tweets with a target value and the sum of those targets.
data_grouped_location = raw_data.groupby('location')['target'].agg(
    target_count='count',
    target_sum='sum',
)
data_grouped_location.sort_values('target_sum', ascending=False)

In [None]:
# Discard the locations that have fewer than 10 tweets, then check the smallest remaining counts.
has_too_few_tweets = data_grouped_location['target_count'] < 10
droped = data_grouped_location.drop(index=data_grouped_location.index[has_too_few_tweets])
droped['target_count'].nsmallest(10)

In [None]:
# Percentage of true tweets per location (target_sum over target_count),
# then the locations ordered from the highest percentage to the lowest.
droped['truth_percentage'] = (droped['target_sum'] * 100) / droped['target_count']
droped_sorted = droped.sort_values(by='truth_percentage', ascending=False)

In [None]:
# Horizontal bars for the ten locations with the highest truth percentage.
top_truth_percentage = droped_sorted['truth_percentage'].nlargest(10)
g = sns.barplot(x=top_truth_percentage, y=top_truth_percentage.index, orient='h')

In [None]:
# Load only the columns needed for the location-keyword relationship analysis.
twitterKeywordAndLocation = pd.read_csv(
    './train.csv',
    usecols=['keyword', 'location'],
)
twitterKeywordAndLocation.iloc[:5]

In [None]:
# Print a summary of the DataFrame (columns, dtypes and non-null counts).
twitterKeywordAndLocation.info()

In [None]:
# Number of non-null values in each column.
twitterKeywordAndLocation.notna().sum()

In [None]:
# Convert both DataFrame columns to the pandas 'string' dtype so that
# string operations can be applied to them.
twitterKeywordAndLocation = twitterKeywordAndLocation.astype(
    {'keyword': 'string', 'location': 'string'}
)

In [None]:
# Cleaning the data

# Remove every row that has a missing keyword or location, then recount.
twitterKeywordAndLocation = twitterKeywordAndLocation.dropna()
twitterKeywordAndLocation.count()

In [None]:
# We consider locations that contain non-alphanumeric characters to be fake,
# so keep only the rows whose location is purely alphanumeric.
# NOTE(review): str.isalnum() is also False for spaces and punctuation, so
# multi-word names are discarded by this heuristic as well - confirm this is intended.
twitterKeywordAndLocation = twitterKeywordAndLocation[twitterKeywordAndLocation['location'].str.isalnum()]
twitterKeywordAndLocation

In [None]:
# Locations without a significant amount of keywords will be dropped.
# First look at the average number of rows per location.
rows_per_location = twitterKeywordAndLocation['location'].value_counts()
rows_per_location.mean()

In [None]:
# Keep only the locations that appear in more than 4 rows, then recount.
twitterKeywordAndLocation = twitterKeywordAndLocation.groupby('location').filter(
    lambda location_rows: location_rows.shape[0] >= 5
)
twitterKeywordAndLocation.count()

In [None]:
# Visualization

# Give every row a weight of 1 so that keywords can be counted by summing.
twitterKeywordAndLocation = twitterKeywordAndLocation.assign(counter=1)
twitterKeywordAndLocation

In [None]:
# Total keyword count for each location, largest first.
keywordsPerLocation = (
    twitterKeywordAndLocation
    .groupby('location')[['counter']]
    .sum()
    .sort_values('counter', ascending=False)
)
keywordsPerLocation

In [None]:
# Bar chart of the 20 locations with the most keywords.
ax = keywordsPerLocation.iloc[:20].plot.bar(
    figsize=(18, 8),
    rot=45,
    title='Amount of keywords per location',
    color='purple',
)
ax.set_ylabel('Keyword counter', size=14)
ax.set_xlabel('Location', size=14)

In [None]:
# Number of occurrences of each keyword, most frequent first, and the average count.
keywordsPopular = (
    twitterKeywordAndLocation
    .groupby('keyword')[['counter']]
    .sum()
    .sort_values('counter', ascending=False)
)
keywordsPopular.mean()

In [None]:
# Keep only the keywords that occur more than once.
keywordsPopular = keywordsPopular.loc[keywordsPopular['counter'].gt(1)]
keywordsPopular.iloc[:5]

In [None]:
# Bar chart of the 20 most popular keywords.
ax = keywordsPopular.iloc[:20].plot.bar(
    figsize=(18, 8),
    rot=45,
    title='Most popular keywords',
    color='green',
)
ax.set_ylabel('Amount', size=14)
ax.set_xlabel('Keyword', size=14)

In [None]:
# Comparison between locations with most keywords and most popular keywords.
# Join every (keyword, location) row with that keyword's overall count; both
# inputs have a 'counter' column, so the merge produces counter_x / counter_y.
locationAndKeyword = pd.merge(twitterKeywordAndLocation, keywordsPopular, on='keyword')
locationAndKeyword['counter'] = locationAndKeyword['counter_x'] + locationAndKeyword['counter_y']
locationAndKeyword = locationAndKeyword.drop(columns=['counter_x', 'counter_y'])
# fillna returns a new object, so the result has to be assigned to take effect.
# Only the numeric column is filled: 0 is not a valid value for the string columns.
locationAndKeyword = locationAndKeyword.fillna({'counter': 0})
# Only the first 40 rows are plotted.
locationAndKeyword = locationAndKeyword.head(40)

In [None]:
# Scatter plot: one marker per (keyword, location) pair, sized by its counter.
g = sns.relplot(
    data=locationAndKeyword,
    x='keyword',
    y='location',
    hue='keyword',
    size='counter',
    sizes=(40, 400),
    alpha=.5,
    height=8,
)
g.ax.set_title('Keywords per location', fontsize=20)
g.set_xlabels('Keyword', fontsize=15)
g.set_ylabels('Location', fontsize=15)
g.ax.figure.set_size_inches(28, 8)

In [None]:
# Start of the analysis of the relation between keywords and hashtags.
hashForKeywordsAndHashtags = dict()
csvFormatted = pd.read_csv(
    './ToChangeKeywordsAndLocations/withoutEncoding.csv',
    usecols=['keyword', 'text', 'target'],
)
# Rows whose keyword is the 'unknown' placeholder are left out.
has_known_keyword = csvFormatted['keyword'].ne('unknown')
csvFormatted = csvFormatted.loc[has_known_keyword]
csvFormatted['keyword'].value_counts().iloc[:20]

In [None]:
def sumHashtagIfNedeed(line, keyword, hashOfKeywords):
    """Add the hashtags found in ``line`` to the counters stored for ``keyword``.

    A hashtag is any whitespace-separated token that starts with '#'. It is
    lower-cased and its leading '#' characters are removed before counting.

    Args:
        line: Text of a tweet. Values that are not strings (for example a NaN
            coming from a missing CSV cell) are ignored.
        keyword: Key under which the hashtag counts are accumulated.
        hashOfKeywords: Mapping ``keyword -> {hashtag: count}``; it is updated
            in place. An entry for ``keyword`` is only created once a hashtag
            has actually been found.

    Returns:
        None.
    """
    if not isinstance(line, str):
        return
    for word in line.split():
        if not word.startswith('#'):
            continue
        hashtag = word.lower().lstrip('#')
        # A token made only of '#' characters carries no tag, so it is not counted.
        if not hashtag:
            continue
        hashtagCounts = hashOfKeywords.setdefault(keyword, {})
        hashtagCounts[hashtag] = hashtagCounts.get(hashtag, 0) + 1

In [None]:
# Count the hashtags of every tweet under the tweet's keyword.
# The mapping is emptied first so that re-running this cell does not double-count.
hashForKeywordsAndHashtags.clear()
for tweet_text, tweet_keyword in zip(csvFormatted['text'], csvFormatted['keyword']):
    sumHashtagIfNedeed(tweet_text, tweet_keyword, hashForKeywordsAndHashtags)

In [None]:
# One row per keyword with the total number of hashtags counted for it.
d = {
    'keyword': list(hashForKeywordsAndHashtags),
    'amount': [sum(tag_counts.values()) for tag_counts in hashForKeywordsAndHashtags.values()],
}
keywordDf = pd.DataFrame(d, columns=['keyword', 'amount'])
# Keep the keywords with more than 15 hashtags, smallest amount first.
uses_many_hashtags = keywordDf['amount'] > 15
keywordDf = keywordDf[uses_many_hashtags].sort_values(by=['amount'])
keywordDf

In [None]:
# Bar chart: number of hashtags used with each keyword.
ax = sns.barplot(data=keywordDf, x='keyword', y='amount')
ax.figure.set_size_inches(20, 8)
ax.set_title('Keyword and amount of hashtags', fontsize=20, color='red')
ax.set_xlabel('Keywords', fontsize=18, color='red')
ax.set_ylabel('Hashtags used', fontsize=18, color='red')
# Keyword labels are drawn vertically.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=16)
plt.show()

In [None]:
# Mean target and number of tweets per keyword.
# The column is dropped with the `columns=` keyword because pandas 2.0 no longer
# accepts `axis` as a positional argument of DataFrame.drop.
csvWithOnlyKeywordTarget = csvFormatted.drop(columns='text')
csvWithOnlyKeywordTarget = csvWithOnlyKeywordTarget.groupby(['keyword']).agg({'target': ['mean', 'count']})
# Flatten the two-level column index into 'target_mean' and 'target_count'.
csvWithOnlyKeywordTarget.columns = csvWithOnlyKeywordTarget.columns.get_level_values(0) + '_' + csvWithOnlyKeywordTarget.columns.get_level_values(1)
csvWithOnlyKeywordTarget = csvWithOnlyKeywordTarget.sort_values(by=['target_mean']).reset_index()  # up to here: the veracity values of ALL keywords
# The inner join keeps only the keywords that are also present in keywordDf.
csvWithOnlyKeywordTarget = pd.merge(csvWithOnlyKeywordTarget, keywordDf, on='keyword', how='inner')
csvWithOnlyKeywordTarget.head(20)

In [None]:
# Bar chart: mean target (veracity) of each keyword.
ax = sns.barplot(data=csvWithOnlyKeywordTarget, x='keyword', y='target_mean')
ax.figure.set_size_inches(20, 8)
ax.set_title('Keyword and veracity value', fontsize=20, color='red')
ax.set_xlabel('Keywords', fontsize=18, color='red')
ax.set_ylabel('Veracity', fontsize=18, color='red')
# Keyword labels are drawn vertically.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=16)
plt.show()