#  Airline Twitter 

## Fuente de información

fuente: https://www.kaggle.com/crowdflower/twitter-airline-sentiment  
  
Un trabajo de análisis de sentimientos sobre los problemas de cada una de las principales aerolíneas estadounidenses. Los datos de Twitter se extrajeron de febrero de 2015 y se les pidió a los contribuyentes que primero clasificaran los tweets positivos, negativos y neutrales, y luego clasificaran las razones negativas (como "vuelo tardío" o "servicio grosero"). 

## Objetivo del proyecto

Reconocer patrones en los datos para poder aportar mayor valor al negocio y generar un modelo que pueda ser capaz de reconocer el sentimiento que está presente en los tweets que estén relacionados a las aerolíneas de USA.

## Cargando los datos

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("datos/Tweets.csv", sep=",", encoding="latin1")
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [3]:
#numero de registros en el dataset
df.shape

(14640, 15)

## Análisis de datos

### Cantidad de tweets por sentimientos

In [4]:
import altair as alt

In [5]:
source = pd.DataFrame({
    'clases':['negative', 'neutral', 'positive'],
    'tweets': list(df["airline_sentiment"].value_counts())
})
alt.Chart(source).mark_bar().encode(

    x=alt.X('clases',axis=alt.Axis(
                                    labelAngle=0,
                                    )),
    y='tweets',
    tooltip=[
        alt.Tooltip('tweets:Q', title="Total tweets"),
    ]
).properties(
    width=400,
    height=300
)

### Cantidad total de tweets por aerolíneas

In [6]:
aerolineas = df["airline"].value_counts()
source = pd.DataFrame({
    'aerolineas':aerolineas.index,
    'tweets': aerolineas.values
})
alt.Chart(source).mark_bar().encode(

    x=alt.X('aerolineas',axis=alt.Axis(
                                    labelAngle=-45,
                                    )),
    y='tweets',
    tooltip=[
        alt.Tooltip('tweets:Q', title="Total tweets"),
    ]
).properties(
    width=400,
    height=300
)

### Cantidades de tweets de sentimiento por aerolíneas

In [7]:
df_filter = df[["airline_sentiment", "airline"]]
# agrupacion de sentiment por aerolineas
serie = df_filter.groupby(["airline","airline_sentiment"])["airline"].count()
df_airline_sent = pd.DataFrame(columns=["airline", "sentiment", "cantidad"])
for air, atr in serie.index:
    df_airline_sent.loc[df_airline_sent.shape[0]] = {
        "airline": air,
        "sentiment": atr,
        "cantidad": serie[air][atr],
    }

In [8]:
gp_chart = alt.Chart(df_airline_sent).mark_bar().encode(
  alt.Column('airline'), 
  alt.X('sentiment', axis=alt.Axis(
                                    labelAngle=-45,
                                    )),
  alt.Y('cantidad', axis=alt.Axis(grid=False)),
  alt.Color('airline'),
    tooltip=[
      alt.Tooltip('cantidad:Q', title="Total tweets"),
  ]
)
  
gp_chart.display()

### Cantidad de incidencias en total

In [9]:
incidencias = df["negativereason"].value_counts()
source = pd.DataFrame({
    'incidencias':incidencias.index,
    'tweets': incidencias.values
})
alt.Chart(source).mark_bar().encode(

    x=alt.X('incidencias',axis=alt.Axis(
                                    labelAngle=-45,
                                    )),
    y='tweets',
    tooltip=[
        alt.Tooltip('tweets:Q', title="Total tweets"),
    ]
).properties(
    width=400,
    height=300
)

### Porcentaje de Incidencias por empresa

In [10]:
df_filter = df[["negativereason", "airline"]]
serie = df_filter.groupby(["airline","negativereason"])["airline"].count()
df_airline_reason = pd.DataFrame(columns=["airline", "negativereason", "cantidad"])
for air, atr in serie.index:
    valor = np.round((serie[air][atr]/serie[air].sum())*100,1)
    df_airline_reason.loc[df_airline_reason.shape[0]] = {
        "airline": air,
        "negativereason": atr,
        "cantidad": valor,
    }

In [11]:
alt.Chart(df_airline_reason).mark_rect().encode(
    x='airline:O',
    y='negativereason:O',
    tooltip=[
        alt.Tooltip('cantidad:Q', title="% issue"),
    ],
    color='cantidad:Q'
).properties(
    width=400,
    height=400
)

### linea temporal de todos los tweets

In [12]:
df["tweet_created"] = df["tweet_created"].astype("datetime64[ns]")

In [13]:
#crear un dataframe para poder crear un grupo
df_fecha = pd.DataFrame()
df_fecha["year"] = df["tweet_created"].dt.year
df_fecha["month"] = df["tweet_created"].dt.month
df_fecha["day"] = df["tweet_created"].dt.day
df_fecha["hour"] = df["tweet_created"].dt.hour

In [14]:
#agrupando por hora los tweets
#airline after:2015-02-24 before:2015-02-17
grupo_hora = df_fecha.groupby(["year", "month", "day", "hour"])

In [15]:
# contar los tweets por hora
serie_tiempo = grupo_hora["hour"].count()

In [16]:
df_x_time = serie_tiempo.index.to_frame(index=None)

In [17]:
df_fecha_tweets = pd.DataFrame()
df_fecha_tweets["fecha"] = pd.to_datetime(df_x_time)
df_fecha_tweets["cantidad"] = serie_tiempo.values

In [18]:
alt.Chart(df_fecha_tweets).mark_line().encode(
    x='fecha:T',
    y='cantidad:Q'
).properties(
    width=600,
    height=300
)

### linea temporal de todos los tweets del dia 18

In [19]:
dia = df_fecha_tweets[(df_fecha_tweets["fecha"] > "2015-02-18") &
                      (df_fecha_tweets["fecha"] < "2015-02-19") ]
alt.Chart(dia).mark_line().encode(
    x='fecha:T',
    y='cantidad:Q'
).properties(
    width=600,
    height=300
)

### linea temporal de los sentimientos de los tweets

In [20]:
df_fecha_sent = df_fecha
df_fecha_sent["sentiment"] = df["airline_sentiment"]
grupo_hora_sent = df_fecha_sent.groupby(["year", "month", "day", "hour", "sentiment"])
serie_tiempo = grupo_hora_sent["sentiment"].count()

In [21]:
df_x_time = serie_tiempo.index.to_frame(index=None)

In [22]:
df_fecha_tweets_sent = pd.DataFrame()
df_fecha_tweets_sent["sentiment"] = df_x_time["sentiment"]
df_fecha_tweets_sent["fecha"] = pd.to_datetime(df_x_time[["year","month","day", "hour"]])
df_fecha_tweets_sent["cantidad"] = serie_tiempo.values

In [23]:
alt.Chart(df_fecha_tweets_sent).mark_line().encode(
    x='fecha:T',
    y='cantidad:Q',
    color='sentiment:N'
).properties(
    width=800,
    height=300
)

### linea temporal de los sentimientos de los tweets del dia 18

In [24]:
dia = df_fecha_tweets_sent[(df_fecha_tweets_sent["fecha"] > "2015-02-18") &
                      (df_fecha_tweets_sent["fecha"] < "2015-02-19") ]
alt.Chart(dia).mark_line().encode(
    x='fecha:T',
    y='cantidad:Q',
    color='sentiment:N'
).properties(
    width=800,
    height=300
)

###  Promedio las incidencias de los 7 días en una franja de 24 horas. 
  
  El propósito de la gráfica es mostrar si alguna incidencia aumentaba en un horario distinto a otra, pero al parecer todas aumentan y decaen en el mismo horario.

In [25]:
df_hora_reason = df_fecha[["hour"]]
df_hora_reason["negativereason"] = df["negativereason"]
df_hora_reason = df_hora_reason[df["airline_sentiment"] == "negative"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_hora_reason["negativereason"] = df["negativereason"]


In [26]:
grupo_hora_reason = df_hora_reason.groupby(["hour", "negativereason"])
serie_tiempo = grupo_hora_reason["hour"].count()
serie_tiempo= serie_tiempo/7 # 7 dias de muestreo

In [27]:
df_x_time = serie_tiempo.index.to_frame(index=None)

In [28]:
df_fecha_tweets_reason = pd.DataFrame()
df_fecha_tweets_reason["negativereason"] = df_x_time["negativereason"]
df_fecha_tweets_reason["hora"] = df_x_time["hour"]
df_fecha_tweets_reason["cantidad"] = serie_tiempo.values

In [29]:
alt.Chart(df_fecha_tweets_reason).mark_line().encode(
    x='hora',
    y='cantidad:Q',
    color='negativereason:N',
    tooltip=[
        alt.Tooltip('cantidad:Q', title="issue"),
    ]
).properties(
    width=800,
    height=300
)

### Mapa de calor de los tweets en cada estado

In [30]:
from vega_datasets import data

In [31]:
pop = data.population_engineers_hurricanes()
pop.head()

Unnamed: 0,state,id,population,engineers,hurricanes
0,Alabama,1,4863300,0.003422,22
1,Alaska,2,741894,0.001591,0
2,Arizona,4,6931071,0.004774,0
3,Arkansas,5,2988248,0.00244,0
4,California,6,39250017,0.007126,0


In [32]:
pop = pop.drop(['population', 'engineers', 'hurricanes'], axis=1)

#### Dataset de ciudades y estados

https://github.com/grammakov/USA-cities-and-states

In [36]:
bp_data = pd.read_csv("datos/us_cities_states_counties.csv", sep="|")
bp_data.head(2)

Unnamed: 0,City,State short,State full,County,City alias
0,Holtsville,NY,New York,SUFFOLK,Internal Revenue Service
1,Holtsville,NY,New York,SUFFOLK,Holtsville


In [None]:
#eliminando columnas innecesarias
bp_data = bp_data.drop(['County', 'City alias'], axis=1)
#eliminando filas repetidas
bp_data = bp_data.drop_duplicates()
#eliminando filas en nulos
bp_data=bp_data[bp_data["City"].notna()]
bp_data.head()

#### Extrayendo las ciudades con nombres validos 
  
Uso el nombre de las ciudades del dataset y busca coincidencias dentro de los tweets que tenga una localización

In [None]:
import re

In [None]:
location = df["tweet_location"].value_counts()
index_loc = pd.Series(location.index)
index_loc=index_loc[index_loc.notna()]

df_cities = pd.DataFrame(columns=["state", "cantidad"])

index_loc = list(index_loc)
for i_city in range(bp_data.shape[0]):
    fila = bp_data.iloc[i_city]
    for loc in index_loc:
        if re.search(fila["City"], loc, re.IGNORECASE):
            df_cities.loc[df_cities.shape[0]] = {
                "state": fila["State full"],
                "cantidad": location[loc]
            }
            index_loc.remove(loc)
            break

#### Agrupacion por estados

In [None]:
states_issue = df_cities.groupby("state").aggregate(
    cantidad = ("cantidad", sum)
)
states_issue = states_issue.reset_index()
states_issue.head()

#### Join entre el dataset de huracanes y el dataset creado con las cantidad de tweet
  
El dataset de los huracanes tiene el id de cada estado pero no tiene las cuidades, asi que utilice el nombre del estado para hacer el join entre ambos dataset

In [None]:
states_issue_merge = pd.merge(states_issue,pop,on='state')
states_issue_merge.sort_values(by=['cantidad'], ascending=False)[["state", "cantidad"]].iloc[:10]

In [None]:
states = alt.topo_feature(data.us_10m.url, 'states')

alt.Chart(states).mark_geoshape().encode(
    color='cantidad:Q',
    tooltip=[
        alt.Tooltip('state:O'),
        alt.Tooltip('cantidad:Q', title="tweets"),
    ],
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(states_issue_merge, 'id', 
                         list(states_issue_merge.columns))
).properties(
    width=500,
    height=300
).project(
    type='albersUsa'
)

## Modelo Clasificación de sentimiento

### Dividiendo el dataset para validar el modelo

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
target = "airline_sentiment"

In [None]:
rest, test = train_test_split(df, test_size=0.2, stratify=df[target])
train, val = train_test_split(rest, test_size=0.2, stratify=rest[target])
len(train), len(val), len(test)

In [None]:
def normalizarTarget(df):
    train_y = df[target]
    train_y = np.where(train_y == "negative", 1, train_y)
    train_y = np.where(train_y == "neutral", 0, train_y)
    train_y = np.where(train_y == "positive", 2, train_y)
    return train_y.astype('int')

In [None]:
#negative: 1
#neutral : 0
#positive: 2

train_y = normalizarTarget(train)
val_y = normalizarTarget(val)
test_y = normalizarTarget(test)

### Tokenizacion de palabras

In [None]:
import math
import re
#texto en varios formatos, limpiar los texto
from bs4 import BeautifulSoup

In [None]:
def tokenize(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Eliminamos la @ y su mención
    tweet = re.sub(r"@[A-Za-z0-9]+", '_Entidad_', tweet)
    # Eliminamos los links de las URLs
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Nos quedamos solamente con los caracteres
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Eliminamos espacios en blanco adicionales
    tweet = re.sub(r" +", ' ', tweet)
    return tweet.split()

### One Hot Encoding

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizador_tweet = CountVectorizer(binary=False, analyzer=tokenize, max_features=5000)

In [None]:
vectorizador_tweet.fit(train["text"])

In [None]:
train_vector = vectorizador_tweet.transform(train["text"])
val_vector = vectorizador_tweet.transform(val["text"])
test_vector = vectorizador_tweet.transform(test["text"])

In [None]:
lr = LogisticRegression(max_iter=1000, class_weight="balanced")

In [None]:
lr.fit(train_vector, train_y)

In [None]:
train_pred = lr.predict(train_vector)
val_pred = lr.predict(val_vector)

In [None]:
print(classification_report(train_y, train_pred))

In [None]:
print(classification_report(val_y, val_pred))

### Word Embedding

http://old.tacosdedatos.com/word-to-vec-ilustrado

In [None]:
import multiprocessing

In [None]:
from gensim.models import Word2Vec

In [None]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [None]:
#https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial
w2v_model = Word2Vec(min_count=3,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [None]:
train_sentence = [tokenize(train["text"].iloc[i_row]) for i_row in range(train.shape[0])]

In [None]:
w2v_model.build_vocab(train_sentence, progress_per=10000)

In [None]:
w2v_model.train(train_sentence, total_examples=w2v_model.corpus_count, epochs=100, report_delay=1)

#### Creando un nuevos dataset con el embedding


suma todos los vectores de cada una de las palabras del tweet y finalmente obtiene un vector con el promedio, este vector representa al tweet con todas sus palabras. 


In [None]:
def docsVector(embbeding, datset):
    train_sentence = [tokenize(datset["text"].iloc[i_row]) for i_row in range(datset.shape[0])]
    docs_vectors = pd.DataFrame()
    num_dim = embbeding.vector_size
    tam_dataset = len(train_sentence)
    media = None
    for indice in range(tam_dataset):
        lista_palabras = train_sentence[indice]
        temp = []
        for palabra in lista_palabras:
            if palabra in embbeding:
                embbeding_word = np.round(embbeding.__getitem__(palabra),5)
                temp.append(list(embbeding_word))
        if len(temp)>0:
            media = pd.Series(np.array(temp).mean(axis = 0))
            docs_vectors = docs_vectors.append(media, ignore_index = True)
        else:
            array = [0 for i in range(num_dim)]
            docs_vectors = docs_vectors.append(media, ignore_index = True)
    return docs_vectors

In [None]:
docsVector_train = docsVector(w2v_model.wv, train)
docsVector_val = docsVector(w2v_model.wv, val)
docsVector_test = docsVector(w2v_model.wv, test)

In [None]:
def normalizarTarget(df):
    train_y = df[target]
    train_y = np.where(train_y == "negative", 1, train_y)
    train_y = np.where(train_y == "neutral", 0, train_y)
    train_y = np.where(train_y == "positive", 2, train_y)
    return train_y.astype('int')

In [None]:
#negative: 1
#neutral : 0
#positive: 2

train_y = normalizarTarget(train)
val_y = normalizarTarget(val)
test_y = normalizarTarget(test)

#### Regresión lineal

In [None]:
lr = LogisticRegression(max_iter=1000, class_weight="balanced")
lr.fit(docsVector_train, train_y)

In [None]:
train_pred = lr.predict(docsVector_train)

In [None]:
accuracy_score(train_y, train_pred)

In [None]:
print(classification_report(train_y, train_pred))

In [None]:
val_pred = lr.predict(docsVector_val)

In [None]:
accuracy_score(val_y, val_pred)

In [None]:
print(classification_report(val_y, val_pred))

#### Red neuronal multicapa

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
clf_bk = MLPClassifier(solver='sgd', alpha=1e-5,
                    hidden_layer_sizes=(300,100), max_iter=500, random_state=1)

In [None]:
clf_bk.fit(docsVector_train, train_y)

In [None]:
train_pred = clf_bk.predict(docsVector_train)

In [None]:
accuracy_score(train_y, train_pred)

In [None]:
print(classification_report(train_y, train_pred))

In [None]:
val_pred = clf_bk.predict(docsVector_val)

In [None]:
accuracy_score(val_y, val_pred)

In [None]:
print(classification_report(val_y, val_pred))