# Exploratory Data Analysis (EDA)
### Jumbo Data

### Import the libraries

In [45]:
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [46]:
import pandas as pd
import numpy as np
import re
import string
import emoji

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including pandas' SettingWithCopyWarning and seaborn's deprecation notices
# (e.g. for sns.distplot) that the cells below would otherwise emit.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import the data set

In [47]:
# Location of the raw export. The path looks like a Google Drive mount inside
# Colab -- presumably the Drive has to be mounted before this cell runs
# (TODO confirm; no mount cell is visible in this notebook).
DATA_PATH = "/content/drive/MyDrive/datasets/jumbo-data.xlsx"

# read the workbook into a DataFrame
df = pd.read_excel(DATA_PATH)

In [None]:
df.head()

In [49]:
# Shorten the verbose export headers so later cells can use compact names.
# Reassigning the result (instead of inplace=True) leaves the loaded frame
# object itself unmodified and keeps the cell safe to re-run.
df = df.rename(
    columns={
        "Case Number": "id",
        "Case Description": "descr",
        "Case Origin": "source",
    }
)

## Data Profiling

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe().T

In [None]:
# count rows whose ("id", "descr") pair repeats an earlier row;
# with the default keep="first" the first occurrence itself is not flagged
duplicates = df.duplicated(subset=["id", "descr"])
print("Number of duplicates:", duplicates.sum())

In [54]:
df.isnull().sum()

id        9
descr     5
source    9
dtype: int64

In [55]:
# flag every row in which at least one column is missing
mask = df.isna().any(axis="columns")

# keep only the flagged rows for inspection
rows_with_null = df.loc[mask]

# dump them as plain text
print(rows_with_null)

              id                                              descr  source
27726  4503153.0                                                NaN  E-mail
60314        NaN  ? 1070 Bruxelles / Brussel [cid:image002.png@0...     NaN
92512        NaN                                                NaN     NaN
92513        NaN  Here you can find the after-movie of last year...     NaN
92514        NaN                                                NaN     NaN
92515        NaN  Best regards, Florian Hello Jumbo Marketing Te...     NaN
92516        NaN                                                NaN     NaN
92517        NaN  Here you can find the after-movie of last year...     NaN
92518        NaN                                                NaN     NaN
92519        NaN  Best regards, Florian Hi you're chatting with ...     NaN


In [56]:
# Drop every row that has a missing value in any column (dropna's default how="any").
# The explicit .copy() makes `data` an independent frame, so the columns added
# to it further down are not assignments on a derived object
# (which is what triggers pandas' SettingWithCopyWarning).
# The original row labels are kept on purpose: later cells look rows up by label.
data = df.dropna().copy()

In [57]:
data.shape

(95912, 3)

In [None]:
data.info()

In [None]:
data["source"].unique()

### Visualize which sources the queries most often come from.

In [None]:
# bar chart with the number of cases per source, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=data, x="source", ax=ax)

In [None]:
# tally the cases per source, most frequent first (value_counts' default order)
counts = data["source"].value_counts(sort=True, ascending=False)
counts

### Explore the text distribution

In [65]:
# Size of each case description, in characters and in whitespace-separated tokens.
# .assign() builds a new frame and rebinds `data`, so nothing is written into a
# frame pandas may still regard as derived from `df` (the SettingWithCopyWarning
# trigger). The vectorised .str accessors replace the per-row apply() calls.
descr_text = data["descr"].astype(str)  # non-string cells are stringified, as before
data = data.assign(
    text_length=descr_text.str.len(),
    text_word_count=descr_text.str.split().str.len(),
)

In [None]:
# Distribution of the description length in characters.
# sns.distplot is deprecated; histplot with kde=True is its replacement, and
# stat="density" keeps the y-axis on the density scale that distplot drew.
ax = sns.histplot(data["text_length"], kde=True, stat="density")
# restrict the view to descriptions of up to 1000 characters
ax.set_xlim(0, 1000)
ax.set_xlabel("Number of characters in a case description.");

In [None]:
# Distribution of the number of whitespace-separated tokens per description.
# sns.distplot is deprecated; histplot with kde=True is its replacement, and
# stat="density" keeps the y-axis on the density scale that distplot drew.
ax = sns.histplot(data["text_word_count"], kde=True, stat="density")
# restrict the view to descriptions of up to 100 tokens
ax.set_xlim(0, 100)
ax.set_xlabel("Number of tokens(words) in a case description.");

### Explore the cases from the `Web` source

In [None]:
# keep only the rows whose source is 'Web' and preview the first 25 descriptions
web_cases = data.loc[data["source"] == "Web"]
web_cases["descr"].head(25)

In [None]:
# full description of the row labelled 0 (a row label, not a position)
web_cases.loc[0, "descr"]

In [None]:
# full description of the row labelled 235 (a row label, not a position)
web_cases.loc[235, "descr"]

In [None]:
# full description of the row labelled 263 (a row label, not a position)
web_cases.loc[263, "descr"]

In [None]:
# full description of the row labelled 553 (a row label, not a position)
web_cases.loc[553, "descr"]

In [None]:
# full description of the row labelled 810 (a row label, not a position)
web_cases.loc[810, "descr"]

### Explore the cases from the `Phone` source

In [None]:
# keep only the rows whose source is 'Phone' and preview the first 25 descriptions
phone_cases = data.loc[data["source"] == "Phone"]
phone_cases["descr"].head(25)

In [None]:
# full description of the row labelled 9 (a row label, not a position)
phone_cases.loc[9, "descr"]

In [None]:
# full description of the row labelled 18 (a row label, not a position)
phone_cases.loc[18, "descr"]

In [None]:
# full description of the row labelled 28 (a row label, not a position)
phone_cases.loc[28, "descr"]

### Explore the cases from the `WhatsApp` source

In [None]:
# keep only the rows whose source is 'WhatsApp' and preview the first 25 descriptions
whapp_cases = data.loc[data["source"] == "WhatsApp"]
whapp_cases["descr"].head(25)

In [None]:
# full description of the row labelled 2 (a row label, not a position)
whapp_cases.loc[2, "descr"]

In [None]:
# full description of the row labelled 3 (a row label, not a position)
whapp_cases.loc[3, "descr"]

In [None]:
# full description of the row labelled 4 (a row label, not a position)
whapp_cases.loc[4, "descr"]

In [None]:
# full description of the row labelled 6 (a row label, not a position)
whapp_cases.loc[6, "descr"]

In [None]:
# full description of the row labelled 8 (a row label, not a position)
whapp_cases.loc[8, "descr"]

### Explore the cases from the `E-mail` source

In [None]:
# keep only the rows whose source is 'E-mail' and preview the first 25 descriptions
email_cases = data.loc[data["source"] == "E-mail"]
email_cases["descr"].head(25)

In [None]:
# full description of the row labelled 34 (a row label, not a position)
email_cases.loc[34, "descr"]

In [None]:
# full description of the row labelled 48 (a row label, not a position)
email_cases.loc[48, "descr"]

In [None]:
# full description of the row labelled 274 (a row label, not a position)
email_cases.loc[274, "descr"]

In [None]:
# full description of the row labelled 315 (a row label, not a position)
email_cases.loc[315, "descr"]