# Analisis exploratorio de datos (aka E.D.A) 

## Conjunto de datos a explorar: `clicks`

In [1]:
import pandas as pd
import numpy as np

from IPython.display import display, Markdown as md
import seaborn as sns
import matplotlib.pyplot as plt

### Carga de los datos
> Tomamos como precondicion que el campo `created` es una fecha del tipo de dato datetime[64]

In [2]:
 clicks = pd.read_csv("../data/clicks.csv.gzip", parse_dates=["created"], compression='gzip')

### Informacion sobre el conjunto de datos

In [3]:
clicks.shape

(26351, 20)

In [4]:
clicks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26351 entries, 0 to 26350
Data columns (total 20 columns):
advertiser_id      26351 non-null int64
action_id          0 non-null float64
source_id          26351 non-null int64
created            26351 non-null datetime64[ns]
country_code       26351 non-null int64
latitude           26351 non-null float64
longitude          26351 non-null float64
wifi_connection    26351 non-null bool
carrier_id         26340 non-null float64
trans_id           26351 non-null object
os_minor           26339 non-null float64
agent_device       3243 non-null float64
os_major           26339 non-null float64
specs_brand        26351 non-null int64
brand              6235 non-null float64
timeToClick        22977 non-null float64
touchX             23011 non-null float64
touchY             23011 non-null float64
ref_type           26351 non-null int64
ref_hash           26351 non-null int64
dtypes: bool(1), datetime64[ns](1), float64(11), int64(6), object(

### Cantidad de Nulos

In [5]:
clicks.isna().sum()

advertiser_id          0
action_id          26351
source_id              0
created                0
country_code           0
latitude               0
longitude              0
wifi_connection        0
carrier_id            11
trans_id               0
os_minor              12
agent_device       23108
os_major              12
specs_brand            0
brand              20116
timeToClick         3374
touchX              3340
touchY              3340
ref_type               0
ref_hash               0
dtype: int64

## Tratamiento por atributo

### `advertiser_id `

In [6]:
clicks['advertiser_id'].head()

0    2
1    0
2    0
3    2
4    2
Name: advertiser_id, dtype: int64

In [7]:
clicks['advertiser_id'].value_counts()

3    26263
0       70
2       12
7        2
1        2
8        1
5        1
Name: advertiser_id, dtype: int64

> `advertiser_id` es un atributo que contiene valores entre `[0..8]` se procede a cambiar el tipo de la columna a `int8`

In [8]:
clicks['advertiser_id'] = pd.to_numeric(clicks['advertiser_id'], downcast='integer')

In [9]:
clicks.shape

(26351, 20)

### `action_id `

In [10]:
clicks['action_id'].head(10)

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
7   NaN
8   NaN
9   NaN
Name: action_id, dtype: float64

In [11]:
clicks['action_id'].isna().sum()

26351

In [12]:
clicks['action_id'].unique()

array([nan])

> El atributo o columna `action_id` en todas sus filas repite el valor `nan`, se procede a eliminar la columna.

In [13]:
clicks.drop(columns=['action_id'], inplace=True)

In [14]:
clicks.shape

(26351, 19)

### `source_id `

In [15]:
clicks['source_id'].head(10)

0    4
1    0
2    0
3    3
4    3
5    4
6    1
7    2
8    0
9    1
Name: source_id, dtype: int64

In [16]:
clicks['source_id'].value_counts()

0     17009
1      4298
6      3371
5      1628
2        27
3         9
10        3
7         2
4         2
9         1
8         1
Name: source_id, dtype: int64

> `source_id` es un atributo que contiene valores entre `[0..10]` se procede a cambiar el tipo de la columna a `int8`

In [17]:
clicks['source_id'] = pd.to_numeric(clicks['source_id'], downcast='integer')

In [18]:
clicks.shape

(26351, 19)

### `created`

In [19]:
clicks['created'].head(10)

0   2019-03-06 22:42:12.755
1   2019-03-08 10:24:30.641
2   2019-03-08 15:24:16.069
3   2019-03-06 03:08:51.543
4   2019-03-06 03:32:55.570
5   2019-03-07 18:02:25.833
6   2019-03-12 15:17:04.453
7   2019-03-06 19:15:47.249
8   2019-03-07 23:11:01.934
9   2019-03-07 23:06:30.642
Name: created, dtype: datetime64[ns]

> No se pueden tomar acciones sobre el atributo `created`

### `country_code`

In [20]:
clicks['country_code'].head(10)

0    6333597102633388268
1    6333597102633388268
2    6333597102633388268
3    6333597102633388268
4    6333597102633388268
5    6333597102633388268
6    6333597102633388268
7    6333597102633388268
8    6333597102633388268
9    6333597102633388268
Name: country_code, dtype: int64

In [21]:
clicks['country_code'].value_counts()

6333597102633388268    26351
Name: country_code, dtype: int64

In [22]:
clicks['country_code'].unique()

array([6333597102633388268])

> El atributo o columna `country_code` en todas sus filas repite el valor `6333597102633388268`, se procede a eliminar la columna.

In [23]:
clicks.drop(columns=['country_code'], inplace=True)

### `latitude`

In [24]:
clicks['latitude'].head(10)

0    1.205689
1    1.218924
2    1.205689
3    1.205689
4    1.205689
5    1.209638
6    1.209638
7    1.205393
8    1.205058
9    1.205689
Name: latitude, dtype: float64

> El atributo o columna `latitude` en sus filas `floats`, se procede a bajar la precision.

In [25]:
clicks['latitude'] = pd.to_numeric(clicks['latitude'], downcast='float')

### `longitude`

In [27]:
clicks['longitude'].head(10)

0    1.070234
1    1.071209
2    1.070234
3    1.070234
4    1.070234
5    1.064208
6    1.064208
7    1.077238
8    1.077332
9    1.070234
Name: longitude, dtype: float64

> El atributo o columna `latitude` en sus filas `floats`, se procede a bajar la precision.

In [39]:
clicks['longitude'] = pd.to_numeric(clicks['longitude'], downcast='float')

### `wifi_connection`

In [40]:
clicks['wifi_connection'].head(10)

KeyError: 'wifi_connection'

In [41]:
clicks['wifi_connection'].unique()

KeyError: 'wifi_connection'

In [42]:
clicks['wifi_connection'].value_counts()

KeyError: 'wifi_connection'

> El atributo o columna `wifi_connection` en todas sus filas repite el valor `nan`, se procede a eliminar la columna.

In [43]:
clicks.drop(columns=['wifi_connection'], inplace=True)

KeyError: "['wifi_connection'] not found in axis"

### `carrier_id`

In [46]:
clicks['carrier_id'].head(10)

0     1.0
1     4.0
2     6.0
3    45.0
4    45.0
5    27.0
6    27.0
7     NaN
8    19.0
9    18.0
Name: carrier_id, dtype: float64

In [48]:
clicks.fillna(value={'carrier_id':0}, inplace=True)
clicks['carrier_id'] = pd.to_numeric(clicks['carrier_id'], downcast='inge')
clicks['carrier_id'].head(10)

0     1.0
1     4.0
2     6.0
3    45.0
4    45.0
5    27.0
6    27.0
7     0.0
8    19.0
9    18.0
Name: carrier_id, dtype: float64

In [None]:
# Config
%matplotlib inline
#pd.options.display.max_columns = None
#plt.figure(figsize=(10, 10));
fig, ax = plt.subplots(figsize=(12,5))

ax.set_xlabel('dates')
ax.set_ylabel('auction counts')

plt.bar(counts_by_day.index, counts_by_day.values)
plt.show()