In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [2]:
plt.rcParams['figure.figsize'] = (20.0, 10.0)

# Features:




    id: ad identifier
    
    click: 0/1 for non-click/click
    
    hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
    
    C1 -- anonymized categorical variable
    
    banner_pos
    
    site_id
    
    site_domain
    
    site_category
    
    app_id
    
    app_domain
    
    app_category
    
    device_id
    
    device_ip
    
    device_model
    
    device_type
    
    device_conn_type
    
    C14-C21 -- anonymized categorical variables




In [3]:
df = pd.read_csv('../../Data/Mobile/train.csv')

In [4]:
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40428967 entries, 0 to 40428966
Data columns (total 24 columns):
id                  float64
click               int64
hour                int64
C1                  int64
banner_pos          int64
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_type         int64
device_conn_type    int64
C14                 int64
C15                 int64
C16                 int64
C17                 int64
C18                 int64
C19                 int64
C20                 int64
C21                 int64
dtypes: float64(1), int64(14), object(9)
memory usage: 7.2+ GB


In [6]:
df.memory_usage()

Index                      80
id                  323431736
click               323431736
hour                323431736
C1                  323431736
banner_pos          323431736
site_id             323431736
site_domain         323431736
site_category       323431736
app_id              323431736
app_domain          323431736
app_category        323431736
device_id           323431736
device_ip           323431736
device_model        323431736
device_type         323431736
device_conn_type    323431736
C14                 323431736
C15                 323431736
C16                 323431736
C17                 323431736
C18                 323431736
C19                 323431736
C20                 323431736
C21                 323431736
dtype: int64

In [7]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [8]:
dfred = reduce_mem_usage(df)

Memory usage of properties dataframe is : 7402.765029907227  MB
******************************
Column:  id
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  click
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  hour
dtype before:  int64
dtype after:  uint32
******************************
******************************
Column:  C1
dtype before:  int64
dtype after:  uint16
******************************
******************************
Column:  banner_pos
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  device_type
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  device_conn_type
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  C14
dtype before:  int64
dtype after:  uint16
***

In [9]:
del df

In [10]:
df = dfred[0]

del dfred

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40428967 entries, 0 to 40428966
Data columns (total 24 columns):
id                  float32
click               uint8
hour                uint32
C1                  uint16
banner_pos          uint8
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_type         uint8
device_conn_type    uint8
C14                 uint16
C15                 uint16
C16                 uint16
C17                 uint16
C18                 uint8
C19                 uint16
C20                 int32
C21                 uint16
dtypes: float32(1), int32(1), object(9), uint16(7), uint32(1), uint8(5)
memory usage: 3.9+ GB


In [12]:
device_types = df.device_type.unique()
device_types

array([1, 0, 4, 5, 2], dtype=uint64)

In [13]:
pos = df.banner_pos.unique()
pos

array([0, 1, 4, 5, 2, 7, 3], dtype=uint64)

In [14]:
device_models = df.device_model.unique()
len(device_models)

8251

In [15]:
df['device_type'].value_counts()

1    37304667
0     2220812
4      774272
5      129185
2          31
Name: device_type, dtype: int64

In [16]:
df0 = df[df['device_type']==0]
df1 = df[df['device_type']==1]
df2 = df[df['device_type']==2]
df4 = df[df['device_type']==4]
df5 = df[df['device_type']==5]

del df

In [None]:


plt.subplot(2, 3, 1)
sns.countplot(x="click", data=df0, palette="Blues")
plt.title('Device 0');

plt.subplot(2, 3, 2)
sns.countplot(x="click", data=df1, palette="Blues")
plt.title('Device 1');

plt.subplot(2, 3, 3)
sns.countplot(x="click", data=df2, palette="Blues")
plt.title('Device 2');

plt.subplot(2, 3, 4)
sns.countplot(x="click", data=df4, palette="Blues")
plt.title('Device 4');

plt.subplot(2, 3, 5)
sns.countplot(x="click", data=df5, palette="Blues")
plt.title('Device 5');



In [None]:
# Plot time
sns.lineplot(x="hour", y="click",
             hue="device_conn_type",
             data=df5)

In [19]:
df2h = df2
df2h['hour'] = df2['hour'].astype('str')
df2h['hour']

30540349    14102815
30546560    14102815
30562664    14102815
30570648    14102815
30572220    14102815
30599240    14102815
30619628    14102815
30647502    14102815
30749595    14102815
30757170    14102815
30800865    14102815
30822145    14102816
30833707    14102816
30844358    14102816
30869331    14102816
30929782    14102816
30961333    14102816
30963213    14102816
31020929    14102816
31033989    14102816
31055732    14102816
31076278    14102816
31149600    14102817
31170217    14102817
31209171    14102817
31217354    14102817
31230906    14102817
31268325    14102817
31273918    14102817
31320380    14102817
31364447    14102817
Name: hour, dtype: object

In [17]:
def years(x):
    x = x[0:2]
    return x

def month(x):
    x = x[2:4]
    return x

def day(x):
    x = x[4:6]
    return x

def hour(x):
    x = x[6:8]
    return x

In [18]:
df2.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
30540349,1.099329e+19,0,14102815,1012,0,60e4b83b,4c984583,50e219e0,ecad2386,7801e8d9,...,2,0,15908,320,50,1752,3,297,100081,82
30546560,1.135895e+19,1,14102815,1012,1,70a14934,f1cfd084,f028772b,ecad2386,7801e8d9,...,2,0,15908,320,50,1752,3,297,100081,82
30562664,1.229828e+19,0,14102815,1012,0,83988d26,8a1a19c5,f028772b,ecad2386,7801e8d9,...,2,0,15908,320,50,1752,3,297,100081,82
30570648,1.277593e+19,0,14102815,1012,0,70a14934,057254ad,f028772b,ecad2386,7801e8d9,...,2,0,15908,320,50,1752,3,297,100081,82
30572220,1.286858e+19,0,14102815,1012,3,625d5547,3af756ff,335d28a8,ecad2386,7801e8d9,...,2,0,21776,320,50,2509,0,553,100151,101


In [19]:
df2h = df2
df2h['hour'] = df2['hour'].astype('str')

df2h['year'] = df2h['hour'].apply(years)
df2h['month'] = df2h['hour'].apply(month)
df2h['day'] = df2h['hour'].apply(day)
df2h['hour1'] = df2h['hour'].apply(hour)
df2h.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,C16,C17,C18,C19,C20,C21,year,month,day,hour1
30540349,1.099329e+19,0,14102815,1012,0,60e4b83b,4c984583,50e219e0,ecad2386,7801e8d9,...,50,1752,3,297,100081,82,14,10,28,15
30546560,1.135895e+19,1,14102815,1012,1,70a14934,f1cfd084,f028772b,ecad2386,7801e8d9,...,50,1752,3,297,100081,82,14,10,28,15
30562664,1.229828e+19,0,14102815,1012,0,83988d26,8a1a19c5,f028772b,ecad2386,7801e8d9,...,50,1752,3,297,100081,82,14,10,28,15
30570648,1.277593e+19,0,14102815,1012,0,70a14934,057254ad,f028772b,ecad2386,7801e8d9,...,50,1752,3,297,100081,82,14,10,28,15
30572220,1.286858e+19,0,14102815,1012,3,625d5547,3af756ff,335d28a8,ecad2386,7801e8d9,...,50,2509,0,553,100151,101,14,10,28,15


In [20]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('df1', 23278112232),
 ('df0', 1385786712),
 ('df4', 483145752),
 ('df5', 80611464),
 ('device_models', 66104),
 ('df2', 28575),
 ('df2h', 28575),
 ('pos', 152),
 ('day', 136),
 ('device_types', 136),
 ('hour', 136),
 ('month', 136),
 ('reduce_mem_usage', 136),
 ('years', 136),
 ('np', 80),
 ('pd', 80),
 ('plt', 80),
 ('sns', 80),
 ('stats', 80)]

In [24]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37304667 entries, 0 to 40428966
Data columns (total 24 columns):
id                  float32
click               uint8
hour                uint32
C1                  uint16
banner_pos          uint8
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_type         uint8
device_conn_type    uint8
C14                 uint16
C15                 uint16
C16                 uint16
C17                 uint16
C18                 uint8
C19                 uint16
C20                 int32
C21                 uint16
dtypes: float32(1), int32(1), object(9), uint16(7), uint32(1), uint8(5)
memory usage: 3.9+ GB


In [None]:
df2h = df2
df2h['hour'] = df2['hour'].astype('str')

df2h['year'] = df2h['hour'].apply(years)
df2h['month'] = df2h['hour'].apply(month)
df2h['day'] = df2h['hour'].apply(day)
df2h['hour1'] = df2h['hour'].apply(hour)
df2h.head()

In [None]:
df5h = splithour(df5.hour)
#df5h = pd.concat([df5['click'], df5h], axis=1, sort=False)
df5h.head()

In [None]:
df4h = splithour(df4)


In [None]:
df1h = splithour(df1)


In [None]:
df0h = splithour(df0)


In [None]:
df4h

In [None]:
31 / 1000