# NLP disaster tweets EDA

Super quick EDA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [None]:
# Read the training CSV from ../data (relative to the notebook's working
# directory) and preview the first rows.
df_train = pd.read_csv(os.path.join("..", "data", "train.csv"))
df_train.head()

# Shape/Size of data

* 7,613 tweets
* 5 columns

In [None]:
# (n_rows, n_columns) of the training frame.
df_train.shape

# Columns?

target:
4,342 (57%) not disaster, 3,271 (43%) disaster - fairly balanced


* id - totally unique ints
* keyword - 0.8% missing, 221 unique values
    * keyword is missing for 0.4% of non-disaster tweets vs 1.3% of disaster tweets - so a missing keyword is slightly predictive of disaster
* location - about a third missing, nearly half of the values are unique, hard to deal with
* text - the text, interestingly not unique... 


In [None]:
# Column labels of the training frame.
df_train.columns

In [None]:
# First rows of the training frame, shown again for reference.
df_train.head()

In [None]:
# Class balance as a fraction of rows. normalize=True divides by the number of
# counted rows, so the result stays correct if the dataset size changes
# (the previous version divided by a hard-coded 7613).
df_train["target"].value_counts(normalize=True)

In [None]:
# Per-column cardinality and missingness, as a count and as a percentage of rows.
# The row count is taken from the frame rather than hard-coded, and percentages
# keep one decimal so that sub-1% values are not rounded to 0% or 1%.
n_rows = len(df_train)
pct_denominator = max(n_rows, 1)  # avoid dividing by zero on an empty frame
for col in df_train.columns:
    n_unique = df_train[col].nunique()
    n_missing = df_train[col].isna().sum()
    print(col)
    print(f"nunique: {n_unique} ({100 * n_unique / pct_denominator:.1f}%)")
    print(f"n missing: {n_missing} ({100 * n_missing / pct_denominator:.1f}%)")

In [None]:
# Is a missing keyword informative? Cross-tabulate "keyword is NaN" against the
# target; normalize="columns" makes each column sum to 1, and the result is
# scaled to percentages.
keyword_missing = df_train["keyword"].isna()
pd.crosstab(
    index=keyword_missing,
    columns=df_train["target"],
    margins=True,
    normalize="columns",
).mul(100)

# Examples

### Points of interest:

Cleaning:
* non-ASCII (mis-encoded) characters, e.g. "don\x89Ûªt", "kill\x89Û_"
* lots of links
* redacted info - "Suicide bombing at [location named]"

Features:
* containing a link is quite predictive of disaster
* Some have a news channel tag, e.g. "#IndiaNews"
* Tweet length? Short tweets (< 80 characters) seem more likely to be non-disaster


In [None]:
# Show one randomly drawn disaster tweet (id and text). The draw is unseeded,
# so re-running the cell shows a different example.
disaster_tweets = df_train.loc[df_train["target"].eq(1), ["id", "text"]]
disaster_tweets.sample(n=1).values

In [None]:
# Check whether containing a link is predictive of disaster.
# The pattern matches both http:// and https:// URLs; matching only the literal
# "http://" would miss every https link. na=False treats a missing text as
# "no link" instead of propagating NaN into the crosstab.
df = df_train.copy()
df["contains_link"] = df["text"].str.contains(r"https?://", regex=True, na=False)
# normalize="columns" makes each column sum to 1; scale to percentages.
pd.crosstab(df["contains_link"], df["target"], normalize="columns", margins=True) * 100

In [None]:
# Character count of every tweet, then one array of lengths per target class.
df = df_train.assign(tweet_length=df_train["text"].str.len())

disaster_lengths = df.loc[df["target"].eq(1), "tweet_length"].to_numpy()
not_disaster_lengths = df.loc[df["target"].eq(0), "tweet_length"].to_numpy()

In [None]:
# Overlaid, semi-transparent histograms of tweet length for the two classes.
fig, ax = plt.subplots(figsize=(8, 6))
for lengths, label in (
    (disaster_lengths, "disaster"),
    (not_disaster_lengths, "not disaster"),
):
    ax.hist(lengths, bins=40, alpha=0.5, label=label)
ax.set_xlabel("tweet length", size=14)
ax.set_ylabel("Count", size=14)
ax.legend(loc="upper left")
plt.show()

In [None]:
# Number of whitespace-separated tokens in every tweet, then one array per class.
# NOTE: this rebinds disaster_lengths / not_disaster_lengths, so after this cell
# they hold word counts.
word_counts = df_train["text"].map(lambda tweet: len(tweet.split()))
df = df_train.assign(tweet_word_length=word_counts)

disaster_lengths = df.loc[df["target"].eq(1), "tweet_word_length"].to_numpy()
not_disaster_lengths = df.loc[df["target"].eq(0), "tweet_word_length"].to_numpy()

df.head()

In [None]:
# Summary statistics of the per-tweet word count.
df["tweet_word_length"].describe()

In [None]:
# Overlaid, semi-transparent histograms of words per tweet for the two classes.
fig, ax = plt.subplots(figsize=(8, 6))
for lengths, label in (
    (disaster_lengths, "disaster"),
    (not_disaster_lengths, "not disaster"),
):
    ax.hist(lengths, bins=30, alpha=0.5, label=label)
ax.set_xlabel("tweet word length", size=14)
ax.set_ylabel("Count", size=14)
ax.legend(loc="upper left")
plt.show()