# Recell Linear Regression

### Imports

In [1]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# split the data into train and test
from sklearn.model_selection import train_test_split

# to build linear regression_model
from sklearn.linear_model import LinearRegression

# to check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# to build linear regression_model using statsmodels
import statsmodels.api as sm

# to compute VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

### Acquisition
#### Acquire data
#### Summarize Data
- .head
- .shape
- .describe
- .info

In [2]:
# Acquire data: read the used-device listings from a CSV expected in the working directory
data = pd.read_csv('used_device_data.csv') 

In [3]:
# First glimpse of the data: preview the first five rows
data.head(n=5)

Unnamed: 0,brand_name,os,screen_size,4g,5g,main_camera_mp,selfie_camera_mp,int_memory,ram,battery,weight,release_year,days_used,normalized_used_price,normalized_new_price
0,Honor,Android,14.5,yes,no,13.0,5.0,64.0,3.0,3020.0,146.0,2020,127,4.307572,4.7151
1,Honor,Android,17.3,yes,yes,13.0,16.0,128.0,8.0,4300.0,213.0,2020,325,5.162097,5.519018
2,Honor,Android,16.69,yes,yes,13.0,8.0,128.0,8.0,4200.0,213.0,2020,162,5.111084,5.884631
3,Honor,Android,25.5,yes,yes,13.0,8.0,64.0,6.0,7250.0,480.0,2020,345,5.135387,5.630961
4,Honor,Android,15.32,yes,no,13.0,8.0,64.0,3.0,5000.0,185.0,2020,293,4.389995,4.947837


In [4]:
# How many rows and columns? `.shape` is (n_rows, n_columns)
data.shape

(3454, 15)

In [5]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numerical columns
data.describe()

Unnamed: 0,screen_size,main_camera_mp,selfie_camera_mp,int_memory,ram,battery,weight,release_year,days_used,normalized_used_price,normalized_new_price
count,3454.0,3275.0,3452.0,3450.0,3450.0,3448.0,3447.0,3454.0,3454.0,3454.0,3454.0
mean,13.713115,9.460208,6.554229,54.573099,4.036122,3133.402697,182.751871,2015.965258,674.869716,4.364712,5.233107
std,3.80528,4.815461,6.970372,84.972371,1.365105,1299.682844,88.413228,2.298455,248.580166,0.588914,0.683637
min,5.08,0.08,0.0,0.01,0.02,500.0,69.0,2013.0,91.0,1.536867,2.901422
25%,12.7,5.0,2.0,16.0,4.0,2100.0,142.0,2014.0,533.5,4.033931,4.790342
50%,12.83,8.0,5.0,32.0,4.0,3000.0,160.0,2015.5,690.5,4.405133,5.245892
75%,15.34,13.0,8.0,64.0,4.0,4000.0,185.0,2018.0,868.75,4.7557,5.673718
max,30.71,48.0,32.0,1024.0,12.0,9720.0,855.0,2020.0,1094.0,6.619433,7.847841


In [6]:
# Column names, non-null counts (which reveal where nulls exist), and the dtype of each column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3454 entries, 0 to 3453
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   brand_name             3454 non-null   object 
 1   os                     3454 non-null   object 
 2   screen_size            3454 non-null   float64
 3   4g                     3454 non-null   object 
 4   5g                     3454 non-null   object 
 5   main_camera_mp         3275 non-null   float64
 6   selfie_camera_mp       3452 non-null   float64
 7   int_memory             3450 non-null   float64
 8   ram                    3450 non-null   float64
 9   battery                3448 non-null   float64
 10  weight                 3447 non-null   float64
 11  release_year           3454 non-null   int64  
 12  days_used              3454 non-null   int64  
 13  normalized_used_price  3454 non-null   float64
 14  normalized_new_price   3454 non-null   float64
dtypes: f

### Acquire and Summarize Takeaways
#### 

In [7]:
# Work on a deep copy so the raw `data` frame remains unchanged
df = data.copy(deep=True)

### Preparation
#### Clean data by converting datatypes and handling missing values
#### View single-variable distributions before splitting data
#### Split data into 3 samples: Train, Validate, and Test using sklearn
#### Scale numeric data so all variables are on same scale

In [8]:
# Count the missing values in each column
df.isna().sum()

brand_name                 0
os                         0
screen_size                0
4g                         0
5g                         0
main_camera_mp           179
selfie_camera_mp           2
int_memory                 4
ram                        4
battery                    6
weight                     7
release_year               0
days_used                  0
normalized_used_price      0
normalized_new_price       0
dtype: int64

In [34]:
# Names of the columns that contain at least one missing value.
# NOTE(review): the execution count (34) shows this cell ran out of order; the
# empty result below presumably reflects `df` after the dropna step -- re-run in sequence.
df.columns[df.isna().any()]

Index([], dtype='object')

#### Drop the nulls because the amount of data lost is only about 6% (201 of 3,454 rows).

In [9]:
# Drop every row that contains a null, assign the result back to df, and verify with .info()
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3253 entries, 0 to 3453
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   brand_name             3253 non-null   object 
 1   os                     3253 non-null   object 
 2   screen_size            3253 non-null   float64
 3   4g                     3253 non-null   object 
 4   5g                     3253 non-null   object 
 5   main_camera_mp         3253 non-null   float64
 6   selfie_camera_mp       3253 non-null   float64
 7   int_memory             3253 non-null   float64
 8   ram                    3253 non-null   float64
 9   battery                3253 non-null   float64
 10  weight                 3253 non-null   float64
 11  release_year           3253 non-null   int64  
 12  days_used              3253 non-null   int64  
 13  normalized_used_price  3253 non-null   float64
 14  normalized_new_price   3253 non-null   float64
dtypes: f

In [None]:
# Engineered feature: gap between the normalized new price and the normalized used price.
# NOTE(review): this is derived from normalized_used_price -- if that column is the
# regression target, price_diff must be kept out of the predictors (leakage). TODO confirm.
df['price_diff'] = df['normalized_new_price'] - df['normalized_used_price']

In [12]:
# Distinct brand names in df
df['brand_name'].unique()

array(['Honor', 'Others', 'HTC', 'Huawei', 'Lava', 'Lenovo', 'LG',
       'Micromax', 'Nokia', 'Oppo', 'Samsung', 'Vivo', 'Xiaomi', 'ZTE',
       'Apple', 'Asus', 'Acer', 'Alcatel', 'BlackBerry', 'Celkon',
       'Coolpad', 'Gionee', 'Google', 'Karbonn', 'Meizu', 'Microsoft',
       'Motorola', 'OnePlus', 'Panasonic', 'Realme', 'Sony', 'Spice',
       'XOLO'], dtype=object)

In [13]:
# Distinct operating systems in df
df['os'].unique()

array(['Android', 'Others', 'iOS', 'Windows'], dtype=object)

In [14]:
# Distinct screen_size values in df
df['screen_size'].unique()

array([14.5 , 17.3 , 16.69, 25.5 , 15.32, 16.23, 13.84, 15.77, 15.47,
       15.9 , 16.74, 25.43, 20.32, 15.29, 12.88, 15.24, 15.37, 16.71,
       25.6 , 15.34, 16.59, 16.81, 20.42, 10.16, 17.78, 12.7 , 15.44,
        5.28, 15.27, 15.8 , 15.72, 12.83,  7.62,  7.75, 10.03, 16.56,
        5.18, 15.39, 10.29, 16.43, 12.93, 15.42, 16.36, 16.21, 16.13,
       17.5 , 16.28, 14.4 , 15.21, 20.12, 16.48, 16.89, 16.31, 25.53,
       12.73, 20.55, 30.71, 27.94, 25.45, 18.01, 14.35, 23.04, 10.34,
       10.24, 12.78, 12.75, 23.01, 11.81, 11.84, 10.36, 11.76, 12.9 ,
       17.4 , 14.55, 12.8 ,  7.65, 10.21,  5.08,  5.13,  5.23, 20.35,
       12.57, 16.33, 18.08, 17.65, 14.83, 15.06, 13.08, 15.14, 30.56,
       19.96, 19.94, 25.4 , 11.89, 15.62, 17.83,  7.85,  7.67, 16.08,
       17.73, 12.52, 20.4 ,  7.82, 15.11, 13.87,  5.16, 10.08, 15.7 ,
       14.88, 15.82, 15.95,  7.7 , 14.53, 17.86, 12.85, 13.34, 11.48,
        7.98, 30.53,  7.8 , 12.12,  8.31,  6.68, 10.31, 11.56, 14.86,
       14.94, 13.79,

In [18]:
# Distinct values of the 4g flag column
pd.unique(df['4g'])

array(['yes', 'no'], dtype=object)

In [19]:
# Distinct values of the 5g flag column
pd.unique(df['5g'])

array(['no', 'yes'], dtype=object)

In [33]:
# Distinct main_camera_mp values in df
df['main_camera_mp'].unique()

array([13.  ,  8.  ,  5.  , 10.5 ,  3.15,  2.  , 16.  ,  0.3 , 12.  ,
       14.5 , 48.  ,  3.  , 21.  ,  1.3 , 13.1 , 24.  ,  0.08, 20.7 ,
       23.  ,  1.  , 18.  , 12.2 , 12.3 , 20.  , 20.2 ,  4.  , 12.5 ,
       10.  ,  6.5 ,  6.7 , 41.  , 20.1 , 12.6 , 16.3 , 22.6 , 19.  ,
       21.5 , 21.2 ,  8.1 ,  1.2 , 22.5 ])

In [21]:
# Distinct selfie_camera_mp values in df
df['selfie_camera_mp'].unique()

array([ 5.  , 16.  ,  8.  , 32.  ,  2.  ,  0.3 , 13.  , 14.5 , 24.  ,
       10.  ,  1.3 ,  0.  , 25.  , 20.  , 12.  ,  7.  ,  1.2 ,  3.  ,
        2.1 ,  1.  ,  4.  ,  1.1 , 10.5 ,  1.6 ,  1.8 ,  0.9 ,  1.25,
       18.  ,  1.9 ,  9.  ,  3.7 , 16.3 ,  5.1 ,  2.2 , 14.  ,  0.65])

In [22]:
# Distinct int_memory values in df
df['int_memory'].unique()

array([6.400e+01, 1.280e+02, 3.200e+01, 1.600e+01, 2.560e+02, 5.120e+02,
       8.000e+00, 4.000e+00, 5.000e-01, 2.000e-01, 1.000e-01, 6.000e-02,
       1.024e+03, 2.400e+01, 1.000e-02])

In [23]:
# Distinct ram values in df
df['ram'].unique()

array([ 3.  ,  8.  ,  6.  ,  4.  ,  2.  ,  1.  , 12.  ,  0.5 ,  0.25,
        0.03,  0.02,  1.5 ])

In [24]:
# Distinct battery values in df
df['battery'].unique()

array([3020. , 4300. , 4200. , 7250. , 5000. , 4000. , 3400. , 4100. ,
       3750. , 5100. , 5680. , 3850. , 3075. , 3300. , 3930. , 4500. ,
       7500. , 6100. , 3500. , 1700. , 3000. , 7000. , 3590. , 4050. ,
       8400. , 1470. , 3450. , 8200. , 1540. , 1500. , 2000. , 1020. ,
       1200. , 4025. , 4230. , 3600. , 4035. , 4020. , 4065. , 3130. ,
       3090. , 3820. , 6000. , 2700. , 4350. , 4030. , 3700. , 3260. ,
       3940. , 1900. , 2920. , 3200. , 2600. , 7040. , 7600. , 2800. ,
       9720. , 5770.5, 8827. , 7812. , 5124. , 3230. , 2200. , 1760. ,
       3100. , 3240. , 2840. , 1800. , 2460. , 4080. , 4420. , 2870. ,
       2420. , 5910. , 4550. , 2300. , 1300. , 2100. , 2500. , 4600. ,
       1630. , 2955. , 2400. , 7300. , 4960. , 2640. , 2710. , 2580. ,
       2850. , 2050. , 2150. , 2960. , 2610. , 2910. , 2820. , 1400. ,
       1000. , 4060. ,  750. ,  850. , 3050. , 5020. , 3150. , 2350. ,
       3140. , 1850. , 1350. , 1920. , 1420. , 1750. , 8000. , 4400. ,
      

In [25]:
# Distinct weight values in df
df['weight'].unique()

array([146.  , 213.  , 480.  , 185.  , 176.  , 144.  , 164.  , 165.  ,
       150.  , 206.  , 171.5 , 172.  , 173.  , 453.6 , 170.  , 160.  ,
       180.  , 192.  , 182.  , 163.  , 310.  , 189.  , 450.  , 226.  ,
       460.  , 183.  , 178.  , 198.  , 196.  , 163.5 , 498.  , 340.  ,
       320.  , 171.  , 196.8 , 136.1 , 350.  , 147.6 , 520.  , 580.  ,
       305.  , 236.  , 159.  , 655.  , 196.5 , 219.  , 127.  , 218.  ,
       148.  , 152.  , 186.  , 194.  , 175.  , 179.  , 140.45, 146.5 ,
        89.  ,  85.  , 200.  ,  90.5 , 220.  ,  88.2 , 181.  , 169.5 ,
       190.  , 215.  , 168.  , 135.  , 313.  , 141.  , 210.  , 206.5 ,
       203.  , 153.  , 234.  , 198.5 , 193.  , 186.7 , 191.5 , 190.5 ,
       218.5 , 217.3 , 217.  , 204.1 , 189.5 , 130.  , 145.  , 184.  ,
       241.  , 191.  , 173.8 , 157.  ,  91.3 ,  86.5 , 154.  , 187.  ,
       467.  , 168.1 , 420.  , 256.  , 653.  , 169.  , 338.  , 184.7 ,
       330.  , 191.6 , 242.  , 505.2 , 138.  , 176.6 , 355.  , 374.  ,
      

In [26]:
# Distinct release years in df
df['release_year'].unique()

array([2020, 2019, 2013, 2014, 2016, 2018, 2015, 2017])

In [27]:
# Distinct days_used values in df
df['days_used'].unique()

array([ 127,  325,  162,  345,  293,  223,  234,  219,  161,  327,  268,
        344,  537,  336,  230,  248,  395,  421,  532,  266,  321,  933,
        187,  499,  497,  202,  493,   91,  299,  354,  306,  231,  205,
        211,  120,  139,   93,  460,  256,  206,  488,  516,  289,  477,
        195,  244,  376,  406,  462,  333,  284,  445,  375,  666,  819,
        233,  419,  267,  461,  423,  528,  254,  370,  482,  691,  273,
        349,  343,  352,  142,   92,  166,  329,  383,  443,  257,  380,
        255,  512,  200,  210,  216,  724,  628,  750, 1016,  956,  680,
        733, 1029,  310,  193,  356,  272,  397,  411,  276,  392,  296,
        664, 1076,  317,  909,  135,  523,  414,  153,  125,  123,  220,
        173,  413,  279,  212,  188,  264,  259,  500,  288,  391,  204,
        456,  224,  451,  338,  455,  431,  334,  415,  931,  407,  470,
        280,  478,  382,  374,  396,  386,  347,  330,  474,  385,  506,
        595,  420,  138,  346,  362,  355,  141,  2

In [28]:
# Distinct normalized_used_price values in df
df['normalized_used_price'].unique()

array([4.30757245, 5.16209665, 5.11108377, ..., 5.03773152, 4.35734965,
       4.34976167])

In [29]:
# Distinct normalized_new_price values in df
df['normalized_new_price'].unique()

array([4.71510025, 5.51901794, 5.88463072, ..., 6.25153773, 4.62418819,
       4.27999391])

In [11]:
# Column names of the working frame
df.columns

Index(['brand_name', 'os', 'screen_size', '4g', '5g', 'main_camera_mp',
       'selfie_camera_mp', 'int_memory', 'ram', 'battery', 'weight',
       'release_year', 'days_used', 'normalized_used_price',
       'normalized_new_price'],
      dtype='object')

In [None]:
# NOTE(review): `bathrooms` and `bedrooms` are not among the columns listed by
# df.info() / df.columns in this notebook, so this line looks like a leftover from a
# different project and is expected to fail on this dataset.
# TODO: remove it or replace it with a device-specific engineered feature.
df['bath_bed_ratio'] = df.bathrooms / df.bedrooms

In [10]:
# Count fully duplicated rows in the raw frame (`data`); the first occurrence is not counted
data.duplicated(keep='first').sum()

0

In [None]:
nums = ['fare']
cats = ['survived', 'pclass', 'sibsp', 'parch', 'alone', 'sex_male', 'embark_town_Queenstown', 'embark_town_Southampton']

In [None]:
for col in cats:
    print(col)
    print(train[col].value_counts())
    print(train[col].value_counts(normalize=True)*100)
    sns.countplot(x=col, data=train)
    plt.show()
    print('-------------')

In [None]:
for col in nums:
    sns.histplot(x=col, data=train)
    plt.show()

In [None]:
#### Visualize single variable distribution

In [None]:
It is important that data scaling happens after data splitting. We don't want to leak information from our test/validate splits by using those to calculate parameters for scaling.



### Exploration
#### Visualize
#### Hypothesize
#### Test Hypotheses

### Modeling
#### Feature Engineering
#### Establish Baseline
#### Build Models
#### Model Evaluation
#### Model Selection and Testing

### Remove duplicate rows

In [None]:
# Distribution of normalized_used_price (presumably the regression target -- TODO confirm).
# With `bins` unspecified, seaborn chooses the bin count automatically (bins='auto').

sns.displot(x='normalized_used_price', data=df)

plt.title('normalized_used_price')
plt.show()


In [None]:
Feature Scaling


In [None]:
Feature Engineering/Selection

In [None]:
# Number of fully duplicated rows in the raw frame
data.duplicated().sum()

In [None]:
# Shape df would have after dropping duplicate rows (the result is not assigned, so df is unchanged)
(df.drop_duplicates()).shape

In [None]:
# Preview the first rows of the working frame
df.head()

In [None]:
4g	5g yes no

In [None]:
# Earliest release year in df
df.release_year.min()

In [None]:
# Latest release year in df
df.release_year.max()

In [None]:
# Distinct brand names
df.brand_name.unique()

In [None]:
# Distinct operating systems
df.os.unique()

In [None]:
# Distinct screen sizes -- these will need to be binned
df.screen_size.unique()

In [None]:
# Column names of the working frame
df.columns

In [None]:
# Smallest main_camera_mp value
df.main_camera_mp.min()

In [None]:
# Largest main_camera_mp value
df.main_camera_mp.max()

In [None]:
# Distinct main_camera_mp values
df.main_camera_mp.unique()

In [None]:
# Smallest selfie_camera_mp value
df.selfie_camera_mp.min()

In [None]:
# Largest selfie_camera_mp value
df.selfie_camera_mp.max()

In [None]:
# Distinct selfie_camera_mp values
df.selfie_camera_mp.unique()

In [None]:
# Min and max of the remaining numeric columns in a single table; this replaces ten
# copy-pasted cells that each displayed one scalar.
df[['int_memory', 'ram', 'battery', 'weight', 'days_used']].agg(['min', 'max'])

In [None]:
'battery', 'weight',
       'release_year', 'days_used', 'normalized_used_price',
       'normalized_new_price'],

In [None]:
# drop columns
# NOTE(review): 'col1' looks like a placeholder -- it is not among the columns shown by
# df.columns, so this call is expected to raise KeyError as written.
# TODO: name the real column(s) to drop.
df = df.drop('col1', axis=1)

In [None]:
# Rows that have a main_camera_mp value (total rows minus the nulls in that column),
# computed from the raw data rather than typed in by hand
data['main_camera_mp'].notna().sum()

In [None]:
# Share of rows that have a main_camera_mp value, computed from the raw data
# rather than from hard-coded counts
data['main_camera_mp'].notna().mean()