In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, log_loss
import operator
import json
from IPython import display
import os
import warnings

np.random.seed(0)
warnings.filterwarnings("ignore")
THRESHOLD = 4

Task: To predict whether the user likes the mobile phone or not. <br>
Assumption: If the average rating of mobile >= threshold, then the user likes it, otherwise not.

<b>Missing values:</b><br>
'Also Known As'(459),'Applications'(421),'Audio Features'(437),'Bezel-less display'(266),'Browser'(449),'Build Material'(338),'Co-Processor'(451),'Display Colour'(457),'Mobile High-Definition Link(MHL)'(472),'Music'(447)
'Email','Fingerprint Sensor Position'(174),'Games'(446),'HDMI'(454),'Heart Rate Monitor'(467),'IRIS Scanner'(467),
'Optical Image Stabilisation'(219),'Other Facilities'(444),'Phone Book'(444),'Physical Aperture'(87),'Quick Charging'(122),'Ring Tone'(444),'Ruggedness'(430),SAR Value(315),'SIM 3'(472),'SMS'(470)', 'Screen Protection'(229),'Screen to Body Ratio (claimed by the brand)'(428),'Sensor'(242),'Software Based Aperture'(473),
'Special Features'(459),'Standby time'(334),'Stylus'(473),'TalkTime'(259), 'USB Type-C'(374),'Video Player'(456),
'Video Recording Features'(458),'Waterproof'(398),'Wireless Charging','USB OTG Support'(159), 'Video ,'Recording'(113),'Java'(471),'Browser'(448)

<b>Very low variance:</b><br>
'Architecture'(most entries are 64-bit),'Audio Jack','GPS','Loudspeaker','Network','Network Support','Other Sensors'(28),'SIM Size', 'VoLTE'


<b>Multivalued:</b><br>
'Colours','Custom UI','Model'(1),'Other Sensors','Launch Date'

<b>Not important:</b><br>
'Bluetooth', 'Settings'(75),'Wi-Fi','Wi-Fi Features'

<b>Doubtful:</b><br>
'Aspect Ratio','Autofocus','Brand','Camera Features','Fingerprint Sensor'(very few entries are missing),
'Fingerprint Sensor Position', 'Graphics'(multivalued),'Image resolution'(multivalued),'SIM Size','Sim Slot(s)', 'User Available Storage', 'SIM 1', 'SIM 2','Shooting Modes', 'Touch Screen'(24), 'USB Connectivity'
    
<b>To check:</b><br>
'Display Type','Expandable Memory','FM Radio'

<b>High Correlation with other features</b><br>
'SIM Slot(s)' high correlation with SIM1
'Weight' has high high correlation with capacity , screen-to-body ratio
'Height' - screen size is also there
    
<b>Given a mobile, we can't directly get these features</b><br>
'Rating Count', 'Review Count'

<b>Keeping:</b><br>
'Capacity','Flash'(17),'Height'(22),'Internal Memory'(20, require cleaning),'Operating System'(25, require cleaning), 'Pixel Density'(1, clean it),'Processor'(22, clean it), 'RAM'(17, clean), 'Rating','Resolution'(cleaning), 'Screen Resolution','Screen Size', 'Thickness'(22), 'Type','User Replaceable','Weight'(cleaning),'Sim Size'(), 'Other Sensors'(28), 'Screen to Body Ratio (calculated)','Width',


In [100]:
# read data from file
train = pd.read_csv("../input/padhai-mp-neuron-like-unlike-classification/train.csv") 
test = pd.read_csv("../input/padhai-mp-neuron-like-unlike-classification/test.csv")


# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 355
Number of features in train: 99
Number of data points in test: 119
Number of features in test: 98


In [101]:
train.head()

Unnamed: 0,PhoneId,Also Known As,Applications,Architecture,Aspect Ratio,Audio Features,Audio Jack,Autofocus,Bezel-less display,Bluetooth,...,Video Recording,Video Recording Features,VoLTE,Waterproof,Weight,Wi-Fi,Wi-Fi Features,Width,Wireless Charging,Rating
0,0,,,64 bit,19:9,,3.5 mm,Phase Detection autofocus,yes,v5.0,...,1920x1080 @ 30 fps,,yes,,182 grams Below Average ▾Weight compared to11 ...,"Wi-Fi 802.11, a/b/g/n","Wi-Fi Direct, Mobile Hotspot",76.4 mm,,4.5
1,1,,Oppo Browser,64 bit,19:9,,3.5 mm,Phase Detection autofocus,yes,v4.2,...,1920x1080 @ 30 fps,,yes,,168 grams Average ▾Weight compared to8 - 12 K ...,"Wi-Fi 802.11, b/g/n",Mobile Hotspot,75.6 mm,,4.5
2,2,,,64 bit,19.5:9,,3.5 mm,yes,yes,v4.2,...,,,yes,,168 grams Average ▾Weight compared to10 - 14 K...,"Wi-Fi 802.11, b/g/n",Mobile Hotspot,74 mm,,4.4
3,4,,,64 bit,18.5:9,Dolby Atmos,3.5 mm,No,yes,v5.0,...,1920x1080 @ 30 fps,,yes,,169 grams Average ▾Weight compared to15 - 23 K...,"Wi-Fi 802.11, a/ac/b/g/n","Wi-Fi Direct, Mobile Hotspot",76.8 mm,,4.3
4,5,,,64 bit,19.5:9,,3.5 mm,Phase Detection autofocus,yes,v4.2,...,1920x1080 @ 60 fps,,yes,,175 grams Below Average ▾Weight compared to12 ...,"Wi-Fi 802.11, a/ac/b/g/n",Mobile Hotspot,76.6 mm,,4.4


In [102]:
train.columns

Index(['PhoneId', 'Also Known As', 'Applications', 'Architecture',
       'Aspect Ratio', 'Audio Features', 'Audio Jack', 'Autofocus',
       'Bezel-less display', 'Bluetooth', 'Brand', 'Browser', 'Build Material',
       'Camera Features', 'Capacity', 'Chipset', 'Co-Processor', 'Colours',
       'Custom UI', 'Display Colour', 'Display Type', 'Email',
       'Expandable Memory', 'FM Radio', 'Fingerprint Sensor',
       'Fingerprint Sensor Position', 'Flash', 'GPS', 'Games', 'Graphics',
       'HDMI', 'Heart Rate Monitor', 'Height', 'IRIS Scanner',
       'Image Resolution', 'Internal Memory', 'Java', 'Launch Date',
       'Loudspeaker', 'Mobile High-Definition Link(MHL)', 'Model', 'Music',
       'NFC', 'Network', 'Network Support', 'Operating System',
       'Optical Image Stabilisation', 'Other Facilities', 'Other Sensors',
       'Phone Book', 'Physical Aperture', 'Pixel Density', 'Processor',
       'Quick Charging', 'RAM', 'Rating Count', 'Resolution', 'Review Count',
       'Ring

In [103]:
#Returns a list with the number of null values in the dataset, feature wise
list(train.isnull().sum(axis = 0))

[0,
 338,
 312,
 82,
 151,
 325,
 12,
 22,
 201,
 4,
 1,
 332,
 253,
 7,
 1,
 26,
 334,
 1,
 185,
 340,
 1,
 347,
 2,
 74,
 20,
 136,
 13,
 22,
 331,
 30,
 338,
 350,
 15,
 351,
 7,
 18,
 353,
 24,
 1,
 354,
 1,
 332,
 97,
 1,
 1,
 21,
 165,
 329,
 24,
 329,
 69,
 1,
 19,
 94,
 15,
 0,
 7,
 0,
 329,
 318,
 233,
 1,
 32,
 353,
 20,
 1,
 352,
 169,
 1,
 1,
 15,
 321,
 177,
 56,
 43,
 354,
 343,
 246,
 354,
 191,
 15,
 21,
 2,
 18,
 120,
 278,
 206,
 18,
 341,
 87,
 341,
 82,
 295,
 30,
 21,
 26,
 15,
 330,
 0]

In [104]:
train['Also Known As'].value_counts()

HTC Desire 826 Dual SIM       1
Nokia X7                      1
Blackberry KEY2 LE            1
Oppo RX17                     1
Vivo X20                      1
Xiaomi Mi 6X                  1
Vivo V11 Pro                  1
Vivo V9 Pro                   1
Samsung SM-G7102              1
Samsung Galaxy A9s            1
Oppo RealMe 1                 1
Vivo X20 Plus                 1
Mobiistar XQ Dual             1
HTC Bolt                      1
Samsung Guru Music 2 B310E    1
Honor View 10                 1
Nokia X5                      1
Name: Also Known As, dtype: int64

In [105]:
#Only describale features in the dataset for now are the 4 marked here.

train.describe()

Unnamed: 0,PhoneId,Rating Count,Review Count,Rating
count,355.0,355.0,355.0,355.0
mean,235.267606,33449.79,6648.228169,4.071549
std,137.204594,115166.1,18943.317868,0.372663
min,0.0,1.0,1.0,2.5
25%,113.5,658.5,228.0,3.9
50%,234.0,3744.0,951.0,4.1
75%,358.0,16136.5,4208.5,4.3
max,472.0,1268083.0,218020.0,5.0


In [106]:
train['Rating']

0      4.5
1      4.5
2      4.4
3      4.3
4      4.4
      ... 
350    3.8
351    3.9
352    4.1
353    3.6
354    4.3
Name: Rating, Length: 355, dtype: float64

In [107]:
train['Browser'].isnull().sum()

332

In [108]:
def data_clean(data):
    
    # Let's first remove all missing value features
    columns_to_remove = ['Also Known As','Applications','Audio Features','Bezel-less display'
                         'Browser','Build Material','Co-Processor','Display Colour','Mobile High-Definition Link(MHL)',
                         'Music', 'Email','Fingerprint Sensor Position',
                         'Games','HDMI','Heart Rate Monitor','IRIS Scanner', 
                         'Optical Image Stabilisation','Other Facilities',
                         'Phone Book','Physical Aperture','Quick Charging',
                         'Ring Tone','Ruggedness','SAR Value','SIM 3','SMS',
                         'Screen Protection','Screen to Body Ratio (claimed by the brand)',
                         'Sensor','Software Based Aperture', 'Special Features',
                         'Standby time','Stylus','TalkTime', 'USB Type-C',
                         'Video Player', 'Video Recording Features','Waterproof',
                         'Wireless Charging','USB OTG Support', 'Video Recording','Java']

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    #Features having very low variance 
    columns_to_remove = ['Architecture','Audio Jack','GPS','Loudspeaker','Network','Network Support','VoLTE']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    # Multivalued:
    columns_to_remove = ['Architecture','Launch Date','Audio Jack','GPS','Loudspeaker','Network','Network Support','VoLTE', 'Custom UI']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    # Not much important
    columns_to_remove = ['Bluetooth', 'Settings','Wi-Fi','Wi-Fi Features']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]
    
    return data

# Removing features

In [109]:
train = data_clean(train)
test = data_clean(test)

removing all those data points in which more than 15 features are missing 

In [110]:
train = train[(train.isnull().sum(axis=1) <= 15)]
# You shouldn't remove data points from test set
#test = test[(test.isnull().sum(axis=1) <= 15)]

In [111]:
# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 46
Number of data points in test: 119
Number of features in test: 45


In [112]:
train.head()

Unnamed: 0,Brand,Touch Screen,NFC,Chipset,SIM Size,User Replaceable,Image Resolution,Autofocus,Aspect Ratio,Screen Size,...,Graphics,Thickness,Width,PhoneId,Other Sensors,Weight,RAM,SIM 1,Rating Count,Model
0,Xiaomi,"Capacitive Touchscreen, Multi-touch",,Qualcomm Snapdragon 636,"SIM1: Nano, SIM2: Nano (Hybrid)",No,4000 x 3000 Pixels,Phase Detection autofocus,19:9,6.26 inches (15.9 cm),...,Adreno 509,8.2 mm Very Good ▾Thickness compared to11 - 17...,76.4 mm,0,"Light sensor, Proximity sensor, Accelerometer,...",182 grams Below Average ▾Weight compared to11 ...,4 GB Excellent ▾RAM compared to11 - 17 K Phone...,4G Bands:TD-LTE 2300(band 40) / 2500(band 41) ...,172274,Redmi Note 6 Pro
1,Realme,"Capacitive Touchscreen, Multi-touch",No,Qualcomm Snapdragon 450,"SIM1: Nano, SIM2: Nano",No,4128 x 3096 Pixels,Phase Detection autofocus,19:9,6.2 inches (15.75 cm),...,Adreno 506,8.2 mm Very Good ▾Thickness compared to8 - 12 ...,75.6 mm,1,"Light sensor, Proximity sensor, Accelerometer,...",168 grams Average ▾Weight compared to8 - 12 K ...,3 GB Excellent ▾RAM compared to8 - 12 K Phones...,4G Bands:TD-LTE 2600(band 38) / 2300(band 40) ...,339447,2
2,Realme,"Capacitive Touchscreen, Multi-touch",,MediaTek Helio P70,"SIM1: Nano, SIM2: Nano",No,4128 x 3096 Pixels,yes,19.5:9,6.3 inches (16 cm),...,Mali-G72 MP3,8 mm Very Good ▾Thickness compared to10 - 14 K...,74 mm,2,"Light sensor, Proximity sensor, Accelerometer",168 grams Average ▾Weight compared to10 - 14 K...,3 GB Excellent ▾RAM compared to10 - 14 K Phone...,4G Bands:TD-LTE 2600(band 38) / 2300(band 40) ...,1149,U1
3,Samsung,"Capacitive Touchscreen, Multi-touch",No,Samsung Exynos 7 Octa 7885,"SIM1: Nano, SIM2: Nano",No,5288 x 3968 Pixels,No,18.5:9,6.0 inches (15.24 cm),...,Mali-G71 MP2,7.5 mm Very Good ▾Thickness compared to15 - 23...,76.8 mm,4,"Light sensor, Proximity sensor, Accelerometer,...",169 grams Average ▾Weight compared to15 - 23 K...,4 GB Excellent ▾RAM compared to15 - 23 K Phone...,4G Bands:TD-LTE 2600(band 38) / 2300(band 40) ...,6026,Galaxy A7 2018
4,Honor,"Capacitive Touchscreen, Multi-touch",No,HiSilicon Kirin 710,"SIM1: Nano, SIM2: Nano",No,5160 x 3872 Pixels,Phase Detection autofocus,19.5:9,6.5 inches (16.51 cm),...,Mali-G51 MP4,7.8 mm Very Good ▾Thickness compared to12 - 18...,76.6 mm,5,"Light sensor, Proximity sensor, Accelerometer,...",175 grams Below Average ▾Weight compared to12 ...,4 GB Excellent ▾RAM compared to12 - 18 K Phone...,4G Bands:TD-LTE 2300(band 40) FD-LTE 1800(band...,5175,8X


In [113]:
train['Browser'].isnull().sum()

328

# Filling Missing values

In [114]:
def for_integer(test):
    try:
        test = test.strip()
        return int(test.split(' ')[0])
    except IOError:
           pass
    except ValueError:
        pass
    except:
        pass

def for_string(test):
    try:
        test = test.strip()
        return (test.split(' ')[0])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass

def for_float(test):
    try:
        test = test.strip()
        return float(test.split(' ')[0])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass
def find_freq(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[2][0] == '(':
            return float(test[2][1:])
        return float(test[2])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass

    
def for_Internal_Memory(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[1] == 'GB':
            return int(test[0])
        if test[1] == 'MB':
#             print("here")
            return (int(test[0]) * 0.001)
    except IOError:
           pass
    except ValueError:
        pass
    except:
        pass
    
def find_freq(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[2][0] == '(':
            return float(test[2][1:])
        return float(test[2])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass


In [115]:
def data_clean_2(x):
    data = x.copy()
    
    data['Capacity'] = data['Capacity'].apply(for_integer)

    data['Height'] = data['Height'].apply(for_float)
    data['Height'] = data['Height'].fillna(data['Height'].mean())

    data['Internal Memory'] = data['Internal Memory'].apply(for_Internal_Memory)

    data['Pixel Density'] = data['Pixel Density'].apply(for_integer)

    data['Internal Memory'] = data['Internal Memory'].fillna(data['Internal Memory'].median())
    data['Internal Memory'] = data['Internal Memory'].astype(int)

    data['RAM'] = data['RAM'].apply(for_integer)
    data['RAM'] = data['RAM'].fillna(data['RAM'].median())
    data['RAM'] = data['RAM'].astype(int)

    data['Resolution'] = data['Resolution'].apply(for_integer)
    data['Resolution'] = data['Resolution'].fillna(data['Resolution'].median())
    data['Resolution'] = data['Resolution'].astype(int)

    data['Screen Size'] = data['Screen Size'].apply(for_float)

    data['Thickness'] = data['Thickness'].apply(for_float)
    data['Thickness'] = data['Thickness'].fillna(data['Thickness'].mean())
    data['Thickness'] = data['Thickness'].round(2)

    data['Type'] = data['Type'].fillna('Li-Polymer')

    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].apply(for_float)
    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].fillna(data['Screen to Body Ratio (calculated)'].mean())
    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].round(2)

    data['Width'] = data['Width'].apply(for_float)
    data['Width'] = data['Width'].fillna(data['Width'].mean())
    data['Width'] = data['Width'].round(2)

    data['Flash'][data['Flash'].isna() == True] = "Other"

    data['User Replaceable'][data['User Replaceable'].isna() == True] = "Other"

    data['Num_cores'] = data['Processor'].apply(for_string)
    data['Num_cores'][data['Num_cores'].isna() == True] = "Other"


    data['Processor_frequency'] = data['Processor'].apply(find_freq)
    #because there is one entry with 208MHz values, to convert it to GHz
    data['Processor_frequency'][data['Processor_frequency'] > 200] = 0.208
    data['Processor_frequency'] = data['Processor_frequency'].fillna(data['Processor_frequency'].mean())
    data['Processor_frequency'] = data['Processor_frequency'].round(2)

    data['Camera Features'][data['Camera Features'].isna() == True] = "Other"

    #simplifyig Operating System to os_name for simplicity
    data['os_name'] = data['Operating System'].apply(for_string)
    data['os_name'][data['os_name'].isna() == True] = "Other"

    data['Sim1'] = data['SIM 1'].apply(for_string)

    data['SIM Size'][data['SIM Size'].isna() == True] = "Other"

    data['Image Resolution'][data['Image Resolution'].isna() == True] = "Other"

    data['Fingerprint Sensor'][data['Fingerprint Sensor'].isna() == True] = "Other"

    data['Expandable Memory'][data['Expandable Memory'].isna() == True] = "No"

    data['Weight'] = data['Weight'].apply(for_integer)
    data['Weight'] = data['Weight'].fillna(data['Weight'].mean())
    data['Weight'] = data['Weight'].astype(int)

    data['SIM 2'] = data['SIM 2'].apply(for_string)
    data['SIM 2'][data['SIM 2'].isna() == True] = "Other"
    
    return data

In [116]:
train = data_clean_2(train)
test = data_clean_2(test)

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 50
Number of data points in test: 119
Number of features in test: 49


Not very important feature

In [117]:
def data_clean_3(x):
    
    data = x.copy()

    columns_to_remove = ['User Available Storage','SIM Size','Chipset','Processor','Autofocus','Aspect Ratio','Touch Screen',
                        'Bezel-less display','Operating System','SIM 1','USB Connectivity','Other Sensors','Graphics','FM Radio',
                        'NFC','Shooting Modes','Browser','Display Colour' ]

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]


    columns_to_remove = [ 'Screen Resolution','User Replaceable','Camera Features',
                        'Thickness', 'Display Type']

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]


    columns_to_remove = ['Fingerprint Sensor', 'Flash', 'Rating Count', 'Review Count','Image Resolution','Type','Expandable Memory',\
                        'Colours','Width','Model']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    return data

In [118]:
train = data_clean_3(train)
test = data_clean_3(test)

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 18
Number of data points in test: 119
Number of features in test: 17


In [119]:
# one hot encoding

train_ids = train['PhoneId']
test_ids = test['PhoneId']

cols = list(test.columns)
cols.remove('PhoneId')
cols.insert(0, 'PhoneId')

combined = pd.concat([train.drop('Rating', axis=1)[cols], test[cols]])
print(combined.shape)
print(combined.columns)

combined = pd.get_dummies(combined)
print(combined.shape)
print(combined.columns)

train_new = combined[combined['PhoneId'].isin(train_ids)]
test_new = combined[combined['PhoneId'].isin(test_ids)]

(460, 17)
Index(['PhoneId', 'Brand', 'Screen to Body Ratio (calculated)', 'Resolution',
       'Internal Memory', 'Sim1', 'SIM Slot(s)', 'Height', 'SIM 2',
       'Pixel Density', 'Weight', 'RAM', 'Processor_frequency', 'os_name',
       'Screen Size', 'Num_cores', 'Capacity'],
      dtype='object')
(460, 87)
Index(['PhoneId', 'Screen to Body Ratio (calculated)', 'Resolution',
       'Internal Memory', 'Height', 'Pixel Density', 'Weight', 'RAM',
       'Processor_frequency', 'Screen Size', 'Capacity', 'Brand_10.or',
       'Brand_Apple', 'Brand_Asus', 'Brand_Billion', 'Brand_Blackberry',
       'Brand_Comio', 'Brand_Coolpad', 'Brand_Do', 'Brand_Gionee',
       'Brand_Google', 'Brand_HTC', 'Brand_Honor', 'Brand_Huawei',
       'Brand_InFocus', 'Brand_Infinix', 'Brand_Intex', 'Brand_Itel',
       'Brand_Jivi', 'Brand_Karbonn', 'Brand_LG', 'Brand_Lava', 'Brand_LeEco',
       'Brand_Lenovo', 'Brand_Lephone', 'Brand_Lyf', 'Brand_Meizu',
       'Brand_Micromax', 'Brand_Mobiistar', 'Brand_Mot

In [120]:
train_new = train_new.merge(train[['PhoneId', 'Rating']], on='PhoneId')

In [121]:
# check the number of features and data points in train
print("Number of data points in train: %d" % train_new.shape[0])
print("Number of features in train: %d" % train_new.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test_new.shape[0])
print("Number of features in test: %d" % test_new.shape[1])

Number of data points in train: 341
Number of features in train: 88
Number of data points in test: 119
Number of features in test: 87


In [122]:
train_new.columns

Index(['PhoneId', 'Screen to Body Ratio (calculated)', 'Resolution',
       'Internal Memory', 'Height', 'Pixel Density', 'Weight', 'RAM',
       'Processor_frequency', 'Screen Size', 'Capacity', 'Brand_10.or',
       'Brand_Apple', 'Brand_Asus', 'Brand_Billion', 'Brand_Blackberry',
       'Brand_Comio', 'Brand_Coolpad', 'Brand_Do', 'Brand_Gionee',
       'Brand_Google', 'Brand_HTC', 'Brand_Honor', 'Brand_Huawei',
       'Brand_InFocus', 'Brand_Infinix', 'Brand_Intex', 'Brand_Itel',
       'Brand_Jivi', 'Brand_Karbonn', 'Brand_LG', 'Brand_Lava', 'Brand_LeEco',
       'Brand_Lenovo', 'Brand_Lephone', 'Brand_Lyf', 'Brand_Meizu',
       'Brand_Micromax', 'Brand_Mobiistar', 'Brand_Moto', 'Brand_Motorola',
       'Brand_Nokia', 'Brand_Nubia', 'Brand_OPPO', 'Brand_OnePlus',
       'Brand_Oppo', 'Brand_Panasonic', 'Brand_Razer', 'Brand_Realme',
       'Brand_Reliance', 'Brand_Samsung', 'Brand_Sony', 'Brand_Spice',
       'Brand_Tecno', 'Brand_Ulefone', 'Brand_VOTO', 'Brand_Vivo',
       'Bran

In [123]:
test_new.columns

Index(['PhoneId', 'Screen to Body Ratio (calculated)', 'Resolution',
       'Internal Memory', 'Height', 'Pixel Density', 'Weight', 'RAM',
       'Processor_frequency', 'Screen Size', 'Capacity', 'Brand_10.or',
       'Brand_Apple', 'Brand_Asus', 'Brand_Billion', 'Brand_Blackberry',
       'Brand_Comio', 'Brand_Coolpad', 'Brand_Do', 'Brand_Gionee',
       'Brand_Google', 'Brand_HTC', 'Brand_Honor', 'Brand_Huawei',
       'Brand_InFocus', 'Brand_Infinix', 'Brand_Intex', 'Brand_Itel',
       'Brand_Jivi', 'Brand_Karbonn', 'Brand_LG', 'Brand_Lava', 'Brand_LeEco',
       'Brand_Lenovo', 'Brand_Lephone', 'Brand_Lyf', 'Brand_Meizu',
       'Brand_Micromax', 'Brand_Mobiistar', 'Brand_Moto', 'Brand_Motorola',
       'Brand_Nokia', 'Brand_Nubia', 'Brand_OPPO', 'Brand_OnePlus',
       'Brand_Oppo', 'Brand_Panasonic', 'Brand_Razer', 'Brand_Realme',
       'Brand_Reliance', 'Brand_Samsung', 'Brand_Sony', 'Brand_Spice',
       'Brand_Tecno', 'Brand_Ulefone', 'Brand_VOTO', 'Brand_Vivo',
       'Bran

In [124]:
train_new.head()

Unnamed: 0,PhoneId,Screen to Body Ratio (calculated),Resolution,Internal Memory,Height,Pixel Density,Weight,RAM,Processor_frequency,Screen Size,...,os_name_iOS,Num_cores_312,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Rating
0,0,80.68,20,64,157.9,403,182,4,1.8,6.26,...,0,0,0,0,0,1,0,0,0,4.5
1,1,80.85,8,32,156.2,271,168,3,1.8,6.2,...,0,0,0,0,0,1,0,0,0,4.5
2,2,83.68,25,32,157.0,409,168,3,2.1,6.3,...,0,0,0,0,0,1,0,0,0,4.4
3,4,74.78,24,64,159.8,411,169,4,2.2,6.0,...,0,0,0,0,0,1,0,0,0,4.3
4,5,84.23,16,64,160.4,396,175,4,2.2,6.5,...,0,0,0,0,0,1,0,0,0,4.4


In [125]:
test_new.head()

Unnamed: 0,PhoneId,Screen to Body Ratio (calculated),Resolution,Internal Memory,Height,Pixel Density,Weight,RAM,Processor_frequency,Screen Size,...,os_name_Tizen,os_name_iOS,Num_cores_312,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa
0,3,80.85,5,16,156.2,271,168,2,1.8,6.2,...,0,0,0,0,0,0,1,0,0,0
1,11,81.6,12,64,156.0,402,205,4,1.8,6.2,...,0,0,0,0,0,0,1,0,0,0
2,13,83.84,25,64,156.7,409,169,6,2.0,6.3,...,0,0,0,0,0,0,1,0,0,0
3,16,83.84,16,64,156.7,409,169,4,2.0,6.3,...,0,0,0,0,0,0,1,0,0,0
4,19,77.43,20,64,158.6,403,181,4,1.8,5.99,...,0,0,0,0,0,0,1,0,0,0


In [126]:
train_new.describe()

Unnamed: 0,PhoneId,Screen to Body Ratio (calculated),Resolution,Internal Memory,Height,Pixel Density,Weight,RAM,Processor_frequency,Screen Size,...,os_name_iOS,Num_cores_312,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Rating
count,341.0,341.0,341.0,341.0,341.0,341.0,341.0,341.0,341.0,341.0,...,341.0,341.0,341.0,341.0,341.0,341.0,341.0,341.0,341.0,341.0
mean,232.970674,72.353343,9.592375,46.369501,149.946394,344.211144,161.343109,11.120235,1.792551,5.462317,...,0.055718,0.002933,0.005865,0.029326,0.035191,0.580645,0.017595,0.325513,0.002933,4.072434
std,137.231746,9.357839,5.953827,46.435453,9.192305,88.161129,22.305556,61.320855,0.451598,0.742529,...,0.229714,0.054153,0.076471,0.168965,0.184532,0.494179,0.131668,0.469255,0.054153,0.377066
min,0.0,23.6,2.0,0.0,115.6,143.0,58.0,1.0,1.0,2.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5
25%,110.0,68.8,5.0,16.0,146.2,282.0,150.0,2.0,1.4,5.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.9
50%,229.0,72.5,8.0,32.0,151.3,326.0,161.0,3.0,1.8,5.5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.1
75%,354.0,77.48,13.0,64.0,156.2,403.0,172.0,4.0,2.2,6.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4.3
max,472.0,87.66,25.0,512.0,174.1,576.0,330.0,512.0,2.8,6.59,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0


In [127]:
train_new['liked'] = train_new['Rating']>=4.0

In [128]:
train_new['liked'].value_counts()

True     238
False    103
Name: liked, dtype: int64

## Dummy Solution

In [129]:
submission = pd.DataFrame({'PhoneId':test_new['PhoneId'], 'Class':[1]*test_new.shape[0]})
submission = submission[['PhoneId', 'Class']]
submission.head()

Unnamed: 0,PhoneId,Class
0,3,1
1,11,1
2,13,1
3,16,1
4,19,1


In [39]:
submission.to_csv("submission.csv", index=False)

In [130]:
class MPNeuron:
    
    def __init__(self):
        self.b = None

    def model(self,x):
        return np.sum(x)>=self.b
  
    def predict(self,X):
        y_pred = []
        for x in X:
            result = self.model(x)
            y_pred.append(result)
        return np.array(y_pred)

    def fit(self,X,Y):
        accuracy = {}
        for b in range(X.shape[1]):
            self.b = b
            accuracy[b] = accuracy_score(self.predict(X),Y)
        
        best_b = max(accuracy, key = accuracy.get)
        self.b = best_b

        print("Optimal b", best_b)
        print("Max accuracy", accuracy[best_b])

    def train_test_analyse(self,X_train,Y_train,X_test,Y_test,test_size=None):
        y_train_preds = []
        y_test_preds = []
        optimal_train_b = 0
        optimal_test_b = 0
        max_train_accuracy = 0
        max_test_accuracy = 0 
        train_accuracies = []
        test_accuracies = []
        for b in range(x_btrain.shape[1]):
            train_correct = 0
            test_correct = 0
        for x,y in zip(x_btrain,y_train):
            y_pred = (np.sum(x)>=b)
            y_train_preds.append(y_pred)
            if y_pred==y:
                train_correct+=1
                accuracy = train_correct/x_btrain.shape[0]
                train_accuracies.append(accuracy)
            if accuracy>max_train_accuracy:
                max_train_accuracy = accuracy
                optimal_train_b = b

        for x,y in zip(x_btest,y_test):
            y_pred = (np.sum(x)>=b)
            y_test_preds.append(y_pred)
            if y_pred==y:
                test_correct+=1
                accuracy = test_correct/x_btest.shape[0]
                test_accuracies.append(accuracy)
            if accuracy>max_test_accuracy:
                max_test_accuracy = accuracy
                optimal_test_b = b

        plt.plot(train_accuracies)
        plt.plot(test_accuracies)
        plt.ylim([0,1])
        if test_size!=None:
              plt.savefig("Accuracy analysis for test size :"+str(test_size)+".png")
        plt.show()


In [131]:
mp = MPNeuron()

In [132]:
x = train_new.drop(['liked','Rating'],axis=1)
y = train_new['liked']

In [140]:
x.head()

Unnamed: 0,PhoneId,Screen to Body Ratio (calculated),Resolution,Internal Memory,Height,Pixel Density,Weight,RAM,Processor_frequency,Screen Size,...,os_name_Tizen,os_name_iOS,Num_cores_312,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa
0,0,80.68,20,64,157.9,403,182,4,1.8,6.26,...,0,0,0,0,0,0,1,0,0,0
1,1,80.85,8,32,156.2,271,168,3,1.8,6.2,...,0,0,0,0,0,0,1,0,0,0
2,2,83.68,25,32,157.0,409,168,3,2.1,6.3,...,0,0,0,0,0,0,1,0,0,0
3,4,74.78,24,64,159.8,411,169,4,2.2,6.0,...,0,0,0,0,0,0,1,0,0,0
4,5,84.23,16,64,160.4,396,175,4,2.2,6.5,...,0,0,0,0,0,0,1,0,0,0


In [134]:
mean_df = train_new.groupby('liked').mean()

In [135]:
mean_df

Unnamed: 0_level_0,PhoneId,Screen to Body Ratio (calculated),Resolution,Internal Memory,Height,Pixel Density,Weight,RAM,Processor_frequency,Screen Size,...,os_name_iOS,Num_cores_312,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Rating
liked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,299.485437,69.088544,7.601942,30.330097,146.980823,327.116505,158.84466,27.543689,1.568447,5.163689,...,0.0,0.0,0.019417,0.038835,0.009709,0.436893,0.009709,0.485437,0.0,3.620388
True,204.184874,73.766261,10.453782,53.310924,151.229813,351.609244,162.42437,4.012605,1.889538,5.591555,...,0.079832,0.004202,0.0,0.02521,0.046218,0.642857,0.021008,0.256303,0.004202,4.268067


In [141]:
mean_df['PhoneId']

liked
False    299.485437
True     204.184874
Name: PhoneId, dtype: float64

In [None]:
#.map(lambda x: 1 if x<1000 else 0)

In [142]:
def bin(train,out,feature):
    
    mean_df = train.groupby(feature).mean()
    cols = list(out.columns)
    for col in cols:
        if mean_df[col][0]>mean_df[col][1]:
            out[col] = train[col].map(lambda x:1 if x<np.mean(np.array(train[col])) else 0)
        else:
            out[col] = train[col].map(lambda x:1 if x>np.mean(np.array(train[col])) else 0)
    return out

In [143]:
x = bin(train_new,x,'liked')

In [144]:
x

Unnamed: 0,PhoneId,Screen to Body Ratio (calculated),Resolution,Internal Memory,Height,Pixel Density,Weight,RAM,Processor_frequency,Screen Size,...,os_name_Tizen,os_name_iOS,Num_cores_312,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa
0,1,1,1,1,1,1,1,1,1,1,...,1,0,0,1,1,0,1,0,1,0
1,1,1,0,0,1,0,1,1,1,1,...,1,0,0,1,1,0,1,0,1,0
2,1,1,1,0,1,1,1,1,1,1,...,1,0,0,1,1,0,1,0,1,0
3,1,1,1,1,1,1,1,1,1,1,...,1,0,0,1,1,0,1,0,1,0
4,1,1,1,1,1,1,1,1,1,1,...,1,0,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,0,0,0,0,0,0,0,1,0,0,...,1,0,0,1,1,0,0,0,0,0
337,0,0,1,1,1,1,1,1,1,1,...,1,0,0,1,1,0,1,0,1,0
338,0,0,0,0,0,0,0,1,0,0,...,1,0,0,1,1,0,1,0,1,0
339,0,0,0,0,0,0,0,1,0,0,...,1,0,0,1,1,0,0,0,0,0


In [67]:
x.to_csv("x.csv", index=False)

In [145]:
x['Num_cores_Quad'].value_counts()

1    230
0    111
Name: Num_cores_Quad, dtype: int64

In [146]:
x['os_name_iOS'].value_counts()

0    322
1     19
Name: os_name_iOS, dtype: int64

In [70]:
x = x.apply(pd.cut,bins=2,labels=[1,0])

In [71]:
x['Num_cores_Quad'].value_counts()

1    230
0    111
Name: Num_cores_Quad, dtype: int64

In [72]:
x['os_name_iOS'].value_counts()

1    322
0     19
Name: os_name_iOS, dtype: int64

In [147]:
x.head()

Unnamed: 0,PhoneId,Screen to Body Ratio (calculated),Resolution,Internal Memory,Height,Pixel Density,Weight,RAM,Processor_frequency,Screen Size,...,os_name_Tizen,os_name_iOS,Num_cores_312,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa
0,1,1,1,1,1,1,1,1,1,1,...,1,0,0,1,1,0,1,0,1,0
1,1,1,0,0,1,0,1,1,1,1,...,1,0,0,1,1,0,1,0,1,0
2,1,1,1,0,1,1,1,1,1,1,...,1,0,0,1,1,0,1,0,1,0
3,1,1,1,1,1,1,1,1,1,1,...,1,0,0,1,1,0,1,0,1,0
4,1,1,1,1,1,1,1,1,1,1,...,1,0,0,1,1,0,1,0,1,0


In [148]:
x = x.values
y = y.values

In [149]:
mp.fit(x,y)

Optimal b 39
Max accuracy 0.7390029325513197
