In [125]:
import pandas as pd

## Explanation of Each Predictor
* **battery_power** - Total energy the battery can store on a single charge, measured in mAh
* **blue**          - Whether the phone has bluetooth or not
* **clock_speed**   - Speed at which microprocessor executes instructions
* **dual_sim**      - Has dual sim support or not
* **fc**            - Front camera mega pixels
* **four_g**        - Whether the phone has 4G or not
* **int_memory**    - Internal memory in gigabytes
* **m_dep**         - Mobile Depth in cm
* **mobile_wt**     - Weight of mobile phone
* **n_cores**       - Number of cores of a processor
* **pc**            - Primary Camera in mega pixels
* **px_height**     - Pixel Resolution Height
* **px_width**      - Pixel Resolution Width
* **ram**           - Random Access Memory in Megabytes
* **sc_h**          - Screen Height of mobile in cm
* **sc_w**          - Screen Width of mobile in cm
* **talk_time**     - Longest time that a single battery charge will last when you are talking
* **three_g**       - Whether the phone has 3G or not
* **touch_screen**  - Whether the phone is touch screen or not
* **wifi**          - Whether the phone has wifi or not
* **price_range**   - Response variable; price class of the phone, coded 0 (lowest) to 3 (highest)

In [126]:
# Load the training data; the path is relative to the notebook's working directory.
train_csv_path = "data/train.csv"
train_df = pd.read_csv(train_csv_path)

In [127]:
# Lift the column cap so wide frames are rendered in full, then preview the first rows.
pd.options.display.max_columns = None
train_df.head(5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0,1


In [128]:
# Summary of the raw frame: column dtypes, non-null counts and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
battery_power    2000 non-null int64
blue             2000 non-null int64
clock_speed      2000 non-null float64
dual_sim         2000 non-null int64
fc               2000 non-null int64
four_g           2000 non-null int64
int_memory       2000 non-null int64
m_dep            2000 non-null float64
mobile_wt        2000 non-null int64
n_cores          2000 non-null int64
pc               2000 non-null int64
px_height        2000 non-null int64
px_width         2000 non-null int64
ram              2000 non-null int64
sc_h             2000 non-null int64
sc_w             2000 non-null int64
talk_time        2000 non-null int64
three_g          2000 non-null int64
touch_screen     2000 non-null int64
wifi             2000 non-null int64
price_range      2000 non-null int64
dtypes: float64(2), int64(19)
memory usage: 328.2 KB


In [129]:
# Per-column dtypes only (the same dtype information that info() reports).
train_df.dtypes

battery_power      int64
blue               int64
clock_speed      float64
dual_sim           int64
fc                 int64
four_g             int64
int_memory         int64
m_dep            float64
mobile_wt          int64
n_cores            int64
pc                 int64
px_height          int64
px_width           int64
ram                int64
sc_h               int64
sc_w               int64
talk_time          int64
three_g            int64
touch_screen       int64
wifi               int64
price_range        int64
dtype: object

In [130]:
# Class balance of the response variable.
train_df['price_range'].value_counts()

3    500
2    500
1    500
0    500
Name: price_range, dtype: int64

In [131]:
# Distinct labels that occur in the response variable.
train_df['price_range'].unique()

array([1, 2, 3, 0], dtype=int64)

In [132]:
# Missing-value count per column. Left as the cell's last expression so the
# notebook renders it directly instead of routing it through print().
train_df.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64


In [133]:
# Number of rows that are exact repeats of an earlier row.
# The vectorised sum is cast to a plain int so the cell shows a bare number.
int(train_df.duplicated().sum())

0

Immediately, I notice that the 'four_g' and 'three_g' columns may be dependent on each other: all of the 4G phones are also labelled as 3G. Instead of leaving these two columns separate, I will combine them into a single column named 'Data' (Regular / 3G / 4G). This will help alleviate dependency issues.

In [134]:
def data_check(row):
    """Print ``row`` when it is flagged as 4G-capable but not 3G-capable.

    ``row`` is any mapping-like object exposing 'three_g' and 'four_g'
    (for example a DataFrame row passed by ``apply(..., axis=1)``).
    Nothing is returned; the only effect is the printed row.
    """
    # Rows that do support 3G are consistent, so there is nothing to report.
    if row['three_g'] != 0:
        return
    # No 3G and no 4G is consistent as well.
    if row['four_g'] != 1:
        return
    print(row)

In [135]:
def data_update(row):
    """Collapse the 'four_g' and 'three_g' flags of ``row`` into one label.

    Parameters
    ----------
    row : mapping-like
        Object exposing the 0/1 flags 'four_g' and 'three_g' (for example a
        DataFrame row passed by ``apply(..., axis=1)``).

    Returns
    -------
    str or None
        '4G' when the 4G flag is set, '3G' when only the 3G flag is set,
        'Regular' when neither is set, and None for any other flag values.
    """
    four_g, three_g = row['four_g'], row['three_g']
    # Test the 4G flag on its own: labelling by the sum of both flags would
    # report a 4G-only phone (four_g=1, three_g=0) as '3G'.
    if four_g == 1:
        return '4G'
    if three_g == 1:
        return '3G'
    if four_g == 0 and three_g == 0:
        return 'Regular'
    # Flags outside {0, 1}: keep signalling "unknown" with None.
    return None

In [136]:
# Sanity check: print any phone flagged as 4G-capable but not 3G-capable.
# data_check only prints and returns None, so the result is discarded (trailing
# semicolon) instead of being stored as an all-None 'Data' column.
train_df.apply(data_check, axis=1);

In [137]:
# Collapse the two connectivity flags into a single 'Data' label, then remove
# the original flag columns from the frame.
train_df['Data'] = train_df.apply(data_update, axis=1)
train_df.drop(labels=['three_g', 'four_g'], axis='columns', inplace=True)

In [138]:
# Preview the frame now that 'Data' has replaced the 'three_g' / 'four_g' flags.
train_df.head(5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,touch_screen,wifi,price_range,Data
0,842,0,2.2,0,1,7,0.6,188,2,2,20,756,2549,9,7,19,0,1,1,Regular
1,1021,1,0.5,1,0,53,0.7,136,3,6,905,1988,2631,17,3,7,1,0,2,4G
2,563,1,0.5,1,2,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,0,2,4G
3,615,1,2.5,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,0,0,2,3G
4,1821,1,1.2,0,13,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,0,1,4G


### Looking at this, the following predictors will need to be converted to factors:
* blue (binary)
* dual_sim (binary)
* four_g (binary) - already merged into 'Data' above
* three_g (binary) - already merged into 'Data' above
* touch_screen (binary)
* wifi (binary)
* price_range (Response: categorical)  
<br>
<br>
Also, we notice that there are neither missing values nor duplicate rows in this dataframe.

In [139]:
# Binary (0/1) feature columns that are relabelled as categories.
cat_vars = [
    "blue",
    "dual_sim",
    "touch_screen",
    "wifi",
]

In [140]:
def factor_create(dataset, col_arr, vals=None):
    """Relabel integer-coded columns and store them as pandas categoricals, in place.

    Intended for exploratory analysis, so that the categories carry names
    describing what they represent rather than raw integer codes.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to modify; the listed columns are overwritten in place.
    col_arr : iterable of str
        Names of the columns to convert.
    vals : dict, optional
        Mapping from integer code to label. Codes missing from the mapping
        are kept as they are. ``None`` (the default) or an empty mapping
        skips the relabelling and only casts the columns to ``category``.

    Returns
    -------
    None
    """
    for col in col_arr:
        converted = dataset[col].astype('int')
        if vals:
            # Assign the result back instead of calling
            # replace(..., inplace=True) on dataset[col]: that chained in-place
            # form acts on a temporary Series under pandas Copy-on-Write and
            # leaves the frame unchanged.
            converted = converted.replace(vals)
        dataset[col] = converted.astype('category')
        

In [141]:
# Binary flags become "No"/"Yes"; the response codes 0-3 get named price tiers.
factor_create(train_df, cat_vars, vals={0: "No", 1: "Yes"})

factor_create(
    train_df,
    ['price_range'],
    vals={
        0: "Low",
        1: "Below Average",
        2: "Above Average",
        3: "High",
    },
)

In [142]:
# Re-inspect the column dtypes after the relabelling step.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
battery_power    2000 non-null int64
blue             2000 non-null category
clock_speed      2000 non-null float64
dual_sim         2000 non-null category
fc               2000 non-null int64
int_memory       2000 non-null int64
m_dep            2000 non-null float64
mobile_wt        2000 non-null int64
n_cores          2000 non-null int64
pc               2000 non-null int64
px_height        2000 non-null int64
px_width         2000 non-null int64
ram              2000 non-null int64
sc_h             2000 non-null int64
sc_w             2000 non-null int64
talk_time        2000 non-null int64
touch_screen     2000 non-null category
wifi             2000 non-null category
price_range      2000 non-null category
Data             2000 non-null object
dtypes: category(5), float64(2), int64(12), object(1)
memory usage: 236.7+ KB


In [143]:
# Write the transformed frame, without its index, to the file 'main' in the working directory.
train_df.to_csv(path_or_buf='main', index=False)