2008 VAST Challenge - Mini Challenge 3 - Cell phone

In [1]:
import warnings
import pandas as pd
import numpy as np
from bokeh.io import show, output_notebook

In [2]:
output_notebook()

The dataset comes from the data collection Nodobo-2011-01-v1. It contains data gathered during a study of the mobile phone usage of 27 high-school students, from September 2010 to February 2011. This dataset includes 13035 call records.

<h2>Import and clean the data</h2>

In [3]:
#import data
original_data  = pd.read_csv('calls.csv')

In [4]:
#explore the data
original_data .head(10)

Unnamed: 0,user,other,direction,duration,timestamp
0,7610039694,7434677000.0,Incoming,211,Wed Sep 15 19:17:44 +0100 2010
1,7641036117,1666472000.0,Outgoing,31,Mon Feb 11 07:18:23 +0000 1980
2,7641036117,7371326000.0,Incoming,45,Mon Feb 11 07:45:42 +0000 1980
3,7641036117,7681546000.0,Outgoing,10,Mon Feb 11 08:04:42 +0000 1980
4,7641036117,7681546000.0,Outgoing,0,Mon Feb 11 08:05:31 +0000 1980
5,7641036117,7681546000.0,Incoming,0,Mon Feb 11 08:06:18 +0000 1980
6,7641036117,7981268000.0,Outgoing,0,Mon Feb 11 08:06:31 +0000 1980
7,7641036117,7588304000.0,Incoming,124,Thu Sep 09 19:35:37 +0100 2010
8,7981267897,7784426000.0,Outgoing,474,Thu Sep 09 18:43:44 +0100 2010
9,7981267897,7743039000.0,Missed,0,Thu Sep 09 19:51:30 +0100 2010


In [5]:
original_data.shape

(13035, 5)

In [6]:
original_data.describe()

Unnamed: 0,user,other,duration
count,13035.0,13035.0,13035.0
mean,7410616000.0,5.680583e+30,59.863521
std,292980300.0,6.4727550000000004e+32,246.58175
min,7086312000.0,9.0,0.0
25%,7102746000.0,7118311000.0,0.0
50%,7408255000.0,7351039000.0,8.0
75%,7681546000.0,7691641000.0,36.0
max,7981268000.0,7.389999999999999e+34,7173.0


In [7]:
#find out if there are missing data
original_data.isnull().sum()

user         0
other        0
direction    0
duration     0
timestamp    0
dtype: int64

In [8]:
'''Clean the data'''
#remove missed calls
data = original_data[original_data.direction != 'Missed']

#change the direction of incoming calls
incomingRows = data[data.direction == 'Incoming']

for row in incomingRows.iterrows():
    data.set_value(row[0],'user',row[1]['other'])
    data.set_value(row[0],'other',row[1]['user'])

#remove direction
data.drop(['direction'], axis=1, inplace=True)

# rename columns
data.rename(columns={'user': 'from', 'other': 'to'}, inplace=True)

#ignore warning 
warnings.filterwarnings('ignore')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [9]:
data.head()

Unnamed: 0,from,to,duration,timestamp
0,7434677419,7610040000.0,211,Wed Sep 15 19:17:44 +0100 2010
1,7641036117,1666472000.0,31,Mon Feb 11 07:18:23 +0000 1980
2,7371326239,7641036000.0,45,Mon Feb 11 07:45:42 +0000 1980
3,7641036117,7681546000.0,10,Mon Feb 11 08:04:42 +0000 1980
4,7641036117,7681546000.0,0,Mon Feb 11 08:05:31 +0000 1980


In [10]:
data.shape

(11211, 4)

<h2>Prepare the features for machine learning</h2>

In [11]:
print("Data types and their frequency\n{}".format(data.dtypes.value_counts()))


Data types and their frequency
int64      2
float64    1
object     1
dtype: int64


We have 2 object columns need to be converted into numeric feature

In [12]:
object_columns_df = data.select_dtypes(include=['object'])
print(object_columns_df.iloc[0])

timestamp    Wed Sep 15 19:17:44 +0100 2010
Name: 0, dtype: object


In [13]:
type(data['to'][0])

numpy.float64

In [14]:
from sklearn.model_selection import train_test_split


In [15]:
from_duration_data = data[['from','duration']]
from_duration_data.shape

(11211, 2)

In [16]:
#Split the data in 80% training data and 20% testing data
X_train, X_test, y_train, y_test = train_test_split(
    from_duration_data.loc[:,'from'], 
    from_duration_data.loc[:,'duration'], test_size=0.2, random_state=1)

#create a new training data frame
train_data = pd.DataFrame({'X_train': X_train, 'y_train': y_train})

In [17]:
X_train.shape, y_train.shape

((8968L,), (8968L,))

In [18]:
X_test.shape, y_test.shape

((2243L,), (2243L,))

<h2>Gaussian Mixture</h2>
Given a particular caller, can call duration be predicted?

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture

In [20]:
#train the training data
gm = GaussianMixture()

#fit the training data
gm.fit(train_data[['X_train']], train_data.y_train)

y_train_pred = gm.predict(train_data[['X_train']])
train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100

X_test = pd.DataFrame({'X_test': X_test})
y_test_pred = gm.predict(X_test)
test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100

train_accuracy

19.714540588760034

In [21]:
test_accuracy

20.909496210432458

<h2> RandomForestClassifier</h2>

In [22]:
from sklearn.ensemble import RandomForestClassifier

#train the training data
rf = RandomForestClassifier()

#fit the training data
rf .fit(train_data[['X_train']], train_data.y_train)

y_train_pred = rf .predict(train_data[['X_train']])
train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100

y_test_pred = rf .predict(X_test)
test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100

train_accuracy

22.870205173951831

In [23]:
test_accuracy

20.597414177440928