In [1]:
import pandas as pd
import numpy as np
import geopandas as gp
import matplotlib
import matplotlib.pyplot as plt
import random as rd
import seaborn as sns
from geopy.geocoders import Nominatim
from textblob import TextBlob


# Make seaborn's "husl" palette the default colour cycle for the plots in this notebook.
sns.set_palette("husl")

%matplotlib inline

ModuleNotFoundError: No module named 'geopandas'

In [None]:
# Load the tweet dataset from disk and preview its first rows.
TWEETS_CSV = "tweets_clean_location.csv"
tweets = pd.read_csv(TWEETS_CSV)
tweets.head()

In [None]:
# Keep only complete, distinct rows: first drop every row that has a missing
# value, then drop rows that are exact duplicates of an earlier one.
tweets = tweets.dropna().drop_duplicates()

## Proceso para obtener la latitud, longitud y dirección en base a su locación

In [None]:
# DISABLED CELL: the whole body below is a single triple-quoted string, so none
# of it executes; the cell only evaluates to that string.
# If enabled, it would geocode every value of `tweets.location` through
# Nominatim (timeout of 20 per lookup) and store an
# (address, latitude, longitude) tuple per row, or (None, None, None) when the
# lookup returns nothing, in the column "address_latitude_longitude".
# NOTE(review): presumably disabled because geocoding every row is slow — confirm.
"""geolocator = Nominatim(user_agent="orga_datos")
def getGeoData(x):
    l = geolocator.geocode(x, timeout=20)
    if l == None:
        return (None, None, None)
    return (l.address, l.latitude, l.longitude)

tweets["address_latitude_longitude"] = tweets.location.transform(lambda x: getGeoData(x))
tweets.head()"""

## Creación de columna latitud y longitud en el dataframe original

In [None]:
# DISABLED CELL: the body is a string literal and therefore never runs.
# If enabled, it would copy element 1 and element 2 of each
# "address_latitude_longitude" tuple into the "latitude" and "longitude" columns.
"""tweets["latitude"] = tweets.address_latitude_longitude.transform(lambda x: x[1])
tweets["longitude"] = tweets.address_latitude_longitude.transform(lambda x: x[2])"""

## Proceso para separar la dirección obtenida de la API en dos columnas: ciudad y país

In [None]:
# DISABLED CELL: the body is a string literal and therefore never runs.
# If enabled, it would split each address on "," and store the first piece as
# "city" and the last piece as "country" (both None when the tuple is empty or
# the address has no comma), then write the frame to
# "tweets_with_clean_location.csv".
# NOTE(review): the load cell of this notebook reads "tweets_clean_location.csv",
# which is a different file name from the one written here — confirm which is intended.
"""def getCleanLocation(x):
    if x[0] == None and x[1] == None and x[2] == None:
        return (None, None)
    
    splittedAddress = x[0].split(",")
    
    if 0 <= len(splittedAddress) <= 1:
        return (None, None)
    
    return (splittedAddress[0], splittedAddress[-1])
    
tweets["city"] = ""
tweets["country"] = ""

for index, row in tweets.iterrows():
    cleanLocation = getCleanLocation(row.address_latitude_longitude)
    tweets.at[index,"city"], tweets.at[index,"country"] = cleanLocation[0], cleanLocation[1]
    
tweets.to_csv("tweets_with_clean_location.csv")"""

In [None]:
# Trim leading/trailing whitespace from every country name with the vectorised
# string accessor. Unlike a per-element `x.strip()` call, a missing value is
# passed through as NaN instead of raising AttributeError.
tweets["country"] = tweets["country"].str.strip()

## Visualización de todos los tweets en base a su longitud y latitud

In [None]:
# Base layer: the 'naturalearth_lowres' dataset resolved through geopandas.
world = gp.read_file(gp.datasets.get_path('naturalearth_lowres'))

# One point geometry per tweet, built from its longitude/latitude columns.
tweet_points = gp.points_from_xy(tweets.longitude, tweets.latitude)
gdf = gp.GeoDataFrame(tweets, geometry=tweet_points)

# The tweets are plotted first (black dots, zorder=2) to obtain the axes; the
# world layer is then drawn on the same axes underneath them (zorder=1).
ax = gdf.plot(color="k", figsize=(15, 10), zorder=2, markersize=1)
world.plot(ax=ax, figsize=(10, 5), zorder=1, cmap='OrRd', scheme='quantiles')

## Creación de columna color para cada keyword del dataframe existente

In [None]:
# Add an empty-string "color" column as a placeholder for a per-keyword colour.
tweets["color"] = ""

In [None]:
def generateHexaColor(x):
    """Return a random colour as a hex string.

    The argument ``x`` is accepted but not used; three values drawn from
    ``rd.random()`` are taken as the red, green and blue components.
    """
    rgb = [rd.random() for _ in range(3)]
    return matplotlib.colors.to_hex(rgb)

# Draw one random hex colour per distinct keyword. The keywords are visited in
# sorted order, matching the key order a groupby on "keyword" would produce.
tweetsWithColor = {
    keyword: generateHexaColor(keyword)
    for keyword in sorted(tweets["keyword"].unique())
}

# Broadcast each keyword's colour to its rows in a single vectorised lookup
# instead of walking the frame row by row.
tweets["color"] = tweets["keyword"].map(tweetsWithColor)
tweets.head()

## Visualización de todos los tweets por keyword en base a su longitud y latitud

In [None]:
# Rebuild the point layer so that it carries the current columns of `tweets`,
# including "color".
tweet_points = gp.points_from_xy(tweets.longitude, tweets.latitude)
gdf = gp.GeoDataFrame(tweets, geometry=tweet_points)

# Each tweet is drawn with the colour stored in its "color" column (zorder=2);
# the world layer goes on the same axes underneath (zorder=1).
ax = gdf.plot(color=gdf["color"], figsize=(15, 10), zorder=2, markersize=1)
world.plot(ax=ax, figsize=(10, 5), zorder=1)

## Visualización de los tweets que están en USA 

In [None]:
# Tweets whose country is exactly "United States of America".
USA = tweets[tweets["country"] == "United States of America"]

USAdf = gp.GeoDataFrame(USA, geometry=gp.points_from_xy(USA.longitude, USA.latitude))
# drop_duplicates returns a new frame, so the result has to be assigned;
# calling it without keeping the result leaves USAdf unchanged.
# After this line there is one row (one plotted point) per distinct city.
USAdf = USAdf.drop_duplicates("city")

# Country outlines; only the United States polygon is drawn as the backdrop.
world = gp.read_file(gp.datasets.get_path("naturalearth_lowres"))
ax = world[world.name == "United States of America"].plot(
    color='white', edgecolor='black', figsize=(15, 10)
)

visu = USAdf.plot(ax=ax, legend=True)

## Ranking de las keywords que tienen tweets con una longitud superior al promedio de longitud de todos los tweets

In [None]:
# Replace every "%20" in the keyword column with a space
# (presumably URL-encoded spaces — confirm against the raw data).
tweets["keyword"] = tweets["keyword"].str.replace("%20", " ")

In [None]:
# Length of each tweet's text, stored in a new "text_length" column.
tweets["text_length"] = tweets["text"].str.len()

In [None]:
# Tweets whose text is longer than the mean text length over all tweets.
x = tweets.loc[tweets["text_length"] > tweets["text_length"].mean()]

# Number of such tweets per keyword, keeping the ten keywords with the most.
# size() counts the rows of each group directly, which gives flat
# "keyword"/"count" columns without going through MultiIndex columns and an
# empty-string column label.
xGroupBy = (
    x.groupby("keyword")
    .size()
    .nlargest(10)
    .rename("count")
    .reset_index()
)

fig, ax = plt.subplots(figsize=(15, 10))
ax.tick_params(axis="x", labelsize=15)
ax.tick_params(axis="y", labelsize=15)
# Pass the axes explicitly rather than relying on the "current axes" state.
g = sns.barplot(x=xGroupBy["count"], y=xGroupBy["keyword"], orient="h", ax=ax)
g.set_title("Keywords with text length > text length mean (of all tweets)", fontsize=15)
g.set_xlabel("Quantity", fontsize=18)
g.set_ylabel("Keyword", fontsize=18)

## Porcentaje de tweets que tienen N saltos de línea (en este caso 3)

In [None]:
# Distinct keywords collected by the loop further down in this cell.
keywords = set()

def hasNNewLines(line, n):
    """Return True when ``line`` contains at least ``n`` newline characters."""
    return line.count("\n") >= n

# Collect every keyword that has at least one tweet with 3 or more newlines
# and no "http" substring in its text.
keywords.update(
    row.keyword
    for _, row in tweets.iterrows()
    if hasNNewLines(row.text, 3) and "http" not in row.text
)

# One row per distinct keyword, so its length is the number of keywords.
x = tweets.groupby("keyword").count()
keyword_total = len(x)

# NOTE(review): this is the share of *keywords* with at least one such tweet,
# not the share of tweets — confirm which one is intended.
percentOfPeopleTryingToSendAPoem = (len(keywords) * 100) / keyword_total
percentOfPeopleTryingToSendAPoem  # TODO: how could this value be plotted?

## Ranking de keywords donde alguno de sus tweets contiene URLs

In [None]:
# Boolean mask: tweets whose text contains "http".
filterBy = tweets["text"].str.contains('http')

# Per keyword, count the matching tweets and keep the ten keywords with the
# highest count in the "text" column; "keyword" is restored as a regular column.
textWithURL = (
    tweets[filterBy]
    .groupby("keyword")
    .count()
    .nlargest(10, "text")
    .reset_index()
)

# Horizontal bar chart of the ranking.
fig, ax = plt.subplots(figsize=(15, 10))
for axis_name in ("x", "y"):
    ax.tick_params(axis=axis_name, labelsize=15)
g = sns.barplot(data=textWithURL, x="text", y="keyword", orient="h", ax=ax)
g.set_title("Keywords where tweets contain URLs", fontsize=15)
g.set_xlabel("Quantity", fontsize=18)
g.set_ylabel("Keyword", fontsize=18)

## Ranking de keywords donde alguno de sus tweets contiene tags

In [None]:
# Boolean mask: tweets whose text contains "@".
filterByTag = tweets["text"].str.contains("@")

# Per keyword, count the matching tweets and keep the ten keywords with the
# highest count in the "text" column; "keyword" is restored as a regular column.
tweetsWithTag = (
    tweets[filterByTag]
    .groupby("keyword")
    .count()
    .nlargest(10, "text")
    .reset_index()
)

# Horizontal bar chart of the ranking.
fig, ax = plt.subplots(figsize=(15, 10))
for axis_name in ("x", "y"):
    ax.tick_params(axis=axis_name, labelsize=15)
g = sns.barplot(data=tweetsWithTag, x="text", y="keyword", orient="h", ax=ax)
g.set_title("Keywords where tweets contain tags", fontsize=15)
g.set_xlabel("Quantity", fontsize=18)
g.set_ylabel("Keyword", fontsize=18)

## Ranking de keywords donde alguno de sus tweets contiene hashtags

In [None]:
# Boolean mask: tweets whose text contains "#".
filterByHashtag = tweets["text"].str.contains("#")

# Per keyword, count the matching tweets and keep the ten keywords with the
# most. Every later step uses `tweetsWithHashtag`, the frame built here, so the
# cell no longer references a name that is never defined in this notebook.
tweetsWithHashtag = tweets[filterByHashtag]
tweetsWithHashtag = (
    tweetsWithHashtag.loc[:, ["keyword", "text"]]
    .groupby("keyword")
    .count()
    .nlargest(10, "text")
    .reset_index()
)

# Horizontal bar chart of the ranking.
fig, ax = plt.subplots(figsize=(15, 10))
ax.tick_params(axis="x", labelsize=15)
ax.tick_params(axis="y", labelsize=15)
g = sns.barplot(x=tweetsWithHashtag["text"], y=tweetsWithHashtag["keyword"], orient="h", ax=ax)
g.set_title("Keywords where tweets contain hashtags", fontsize=15)
g.set_xlabel("Quantity", fontsize=18)
g.set_ylabel("Keyword", fontsize=18)