# 1. Preprocessing - Linear Regression project

### Notebook objective: clean both datasets in preparation for exploration

Datasets written by this notebook are prefixed with `1-` (for example `1-cars.parquet`), matching this notebook's number.

In [4]:
import pandas as pd
import matplotlib as plt
import numpy as np
import os
import re
import fastparquet

In [5]:
# Read the raw CSVs; a '*' entry in the car log is parsed as a missing value.
cars = pd.read_csv('car_log.csv', na_values='*')
people = pd.read_csv('ppl.csv')

# Preview the first four rows of each frame.
display(cars[:4], people[:4])

Unnamed: 0,Vehicle Name,Small/Sporty/ Compact/Large Sedan,Sports Car,SUV,Wagon,Minivan,Pickup,AWD,RWD,Retail Price,Dealer Cost,Engine Size (l),Cyl,HP,City MPG,Hwy MPG,Weight,Wheel Base,Len,Width
0,Acura 3.5 RL 4dr,1,0,0,0,0,0,0,0,43755,39014,3.5,6,225,18.0,24.0,3880.0,115.0,197.0,72.0
1,Acura 3.5 RL w/Navigation 4dr,1,0,0,0,0,0,0,0,46100,41100,3.5,6,225,18.0,24.0,3893.0,115.0,197.0,72.0
2,Acura MDX,0,0,1,0,0,0,1,0,36945,33337,3.5,6,265,17.0,23.0,4451.0,106.0,189.0,77.0
3,Acura NSX coupe 2dr manual S,0,1,0,0,0,0,0,1,89765,79978,3.2,6,290,17.0,24.0,3153.0,100.0,174.0,71.0


Unnamed: 0,id,age,salary,height,family_size,location,sex,car
0,1,18,32354,151,4,urban,male,Toyota Celica GT-S 2dr
1,2,57,32471,181,3,rural,female,Ford Crown Victoria 4dr
2,3,21,90452,151,1,rural,male,BMW Z4 convertible 2.5i 2dr
3,4,40,56945,163,1,urban,female,Volvo XC70


### Utility functions and classes

In [6]:
def cols_to_snake(df):
    ''' Return a copy of a dataframe with snake case column headers.

    Every column label is converted to a string, lower-cased, has "-" and
    " " replaced by "_", and has "/" removed. Other characters (for example
    parentheses) are left untouched.

    The input dataframe is NOT modified; a renamed copy is returned, so the
    caller must use the return value (e.g. ``df = cols_to_snake(df)``).

    Input: pd dataframe
    Outputs: new pd dataframe with clean headers

    '''

    # Cast first so non-string labels (e.g. integer headers) can go through .str
    clean_headers = (
        df.columns.astype("str")
        .str.lower()
        .str.replace("-", "_", regex=False)
        .str.replace(" ", "_", regex=False)
        .str.replace("/", "", regex=False)
    )

    # Work on a copy so the caller's frame keeps its original headers
    renamed = df.copy()
    renamed.columns = clean_headers
    return renamed

In [7]:
# Normalise the headers of both frames to snake case.
cars, people = cols_to_snake(cars), cols_to_snake(people)

In [8]:
# Re-check the first four rows of each frame after the header clean-up.
display(cars[:4], people[:4])

Unnamed: 0,vehicle_name,smallsporty_compactlarge_sedan,sports_car,suv,wagon,minivan,pickup,awd,rwd,retail_price,dealer_cost,engine_size_(l),cyl,hp,city_mpg,hwy_mpg,weight,wheel_base,len,width
0,Acura 3.5 RL 4dr,1,0,0,0,0,0,0,0,43755,39014,3.5,6,225,18.0,24.0,3880.0,115.0,197.0,72.0
1,Acura 3.5 RL w/Navigation 4dr,1,0,0,0,0,0,0,0,46100,41100,3.5,6,225,18.0,24.0,3893.0,115.0,197.0,72.0
2,Acura MDX,0,0,1,0,0,0,1,0,36945,33337,3.5,6,265,17.0,23.0,4451.0,106.0,189.0,77.0
3,Acura NSX coupe 2dr manual S,0,1,0,0,0,0,0,1,89765,79978,3.2,6,290,17.0,24.0,3153.0,100.0,174.0,71.0


Unnamed: 0,id,age,salary,height,family_size,location,sex,car
0,1,18,32354,151,4,urban,male,Toyota Celica GT-S 2dr
1,2,57,32471,181,3,rural,female,Ford Crown Victoria 4dr
2,3,21,90452,151,1,rural,male,BMW Z4 convertible 2.5i 2dr
3,4,40,56945,163,1,urban,female,Volvo XC70


In [9]:
# Inspect the dtype of every people column -- all datatypes are ok.
people.dtypes

id              int64
age             int64
salary          int64
height          int64
family_size     int64
location       object
sex            object
car            object
dtype: object

In [10]:
# Inspect the dtype of every cars column.
cars.dtypes

vehicle_name                       object
smallsporty_compactlarge_sedan      int64
sports_car                          int64
suv                                 int64
wagon                               int64
minivan                             int64
pickup                              int64
awd                                 int64
rwd                                 int64
retail_price                        int64
dealer_cost                         int64
engine_size_(l)                   float64
cyl                                 int64
hp                                  int64
city_mpg                          float64
hwy_mpg                           float64
weight                            float64
wheel_base                        float64
len                               float64
width                             float64
dtype: object

In [11]:
# Write both frames to parquet; the leading "1-" in each file name
# corresponds to this notebook (see the note at the top of the notebook).
cars.to_parquet('1-cars.parquet')
people.to_parquet('1-people.parquet')