In [66]:
import pandas as pd

pd.set_option('precision', 3)
pd.set_option('max_rows', 20)
pd.set_option('max_colwidth', 30)

%config Completer.use_jedi = False

## 1.  cleaning general-motors_carsales_src _dirty.csv


In [67]:
#read the GM file
gm = pd.read_csv("general-motors_carsales_src _dirty.csv", 
                                   encoding='ISO-8859-1')

In [68]:
#show null value
gm[gm.isnull().values==True]

Unnamed: 0,Year,Month,general-motors
1,2005.0,Feb,
26,2007.0,,333899
41,,Jun,253859
62,2010.0,,Mar187335


### fill the null value in 'Year'

In [69]:
# check the rows near the null value (index 41)
print(gm.iloc[35:45])

# from the result, we know the null value should be 2008
# fill in '2008'
gm['Year']=gm['Year'].fillna(2008)

# change the datatype of 'year' from float to intger 
gm['Year']=gm['Year'].astype(int)

       Year Month general-motors
35 2007.000   Dec         311382
36 2008.000   Jan         243955
37 2008.000   Feb         260800
38 2008.000   Mar         270650
39 2008.000   Apr         248746
40 2008.000   May         259587
41      NaN   Jun         253859
42 2008.000   Jul         230846
43 2008.000   Aug         303268
44 2008.000   Sep         279054


### fill the null value in 'Month'

In [70]:
# check the rows near the null value (index 26)
print(gm.iloc[20:30])

# from the result, we know the null value is 'Mar', so we fill 'Mar' into the cell
gm['Month'][26]='Mar'

    Year Month general-motors
20  2006   Sep         325729
21  2006   Oct         290221
22  2006   Nov         285560
23  2006   Dec         324413
24  2007   Jan         237351
25  2007   Feb         300624
26  2007   NaN         333899
27  2007   Apr         299061
28  2007   May         361590
29  2007   Jun         307784


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gm['Month'][26]='Mar'


### fix the null value in index 62

In [71]:
# check the row near index 62
print(gm.iloc[60:65])

# the data in column 'Month' is combined with column 'general-motors' 
# the string is split into two parts
gm['Month'][62]=gm['general-motors'][62][0:3]
gm['general-motors'][62]=gm['general-motors'][62][3:]

    Year Month general-motors
60  2010   Jan         145388
61  2010   Feb         140873
62  2010   NaN      Mar187335
63  2010   Apr         183005
64  2010   May         222841


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gm['Month'][62]=gm['general-motors'][62][0:3]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gm['general-motors'][62]=gm['general-motors'][62][3:]


In [72]:
# change the datatype of 'year' from object to intger 
gm['general-motors']=gm['general-motors'].astype(float)

### fix the null value in 'general-motors'

In [73]:
# list all value & mean of 2005
print(gm.loc[gm['Year']==2005])
print('*'*20)
mean_2005=gm.loc[gm['Year']==2005].mean()
print('The mean value of 2005:',mean_2005)
print('*'*20)

# the all value and mean of Feb from 2005-2015
print(gm.loc[gm['Month']=='Feb'])
print('*'*20)
mean_2005_2015_feb=gm['general-motors'].loc[gm['Year'] <= 2015].groupby([gm['Month']]).mean()['Feb']
print('The mean value of Feb from 2005-2010: ',mean_2005_2015_feb)

    Year Month  general-motors
0   2005   Jan      269125.000
1   2005   Feb             NaN
2   2005   Mar      409418.000
3   2005   Apr      368088.000
4   2005   May      372586.000
5   2005   Jun      533652.000
6   2005   Jul      505727.000
7   2005   Aug      338888.000
8   2005   Sep      336487.000
9   2005   Oct      245904.000
10  2005   Nov      269083.000
11  2005   Dec      375601.000
********************
The mean value of 2005: Year               2005.000
general-motors   365869.000
dtype: float64
********************
     Year Month  general-motors
1    2005   Feb             NaN
13   2006   Feb      288853.000
25   2007   Feb      300624.000
37   2008   Feb      260800.000
49   2009   Feb      124879.000
61   2010   Feb      140873.000
73   2011   Feb      206272.000
85   2012   Feb      417232.000
97   2013   Feb      222832.000
109  2014   Feb      221610.000
121  2015   Feb      230490.000
133  2016   Feb      226778.000
145  2017   Feb      236376.000
157  2018   

In [74]:
# fill the null value using mean of Feb 2005-2015
gm['general-motors'][1]=mean_2005_2015_feb

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gm['general-motors'][1]=mean_2005_2015_feb


In [75]:
#check null value again
print('Current number of null value in the dataset:', gm.isnull().any().sum())

Current number of null value in the dataset: 0


### fix the outlier value

In [76]:
# check total data
pd.set_option('display.float_format',lambda x : '%.3f' % x)
gm['general-motors'].describe()

count       204.000
mean     240065.858
std      113783.505
min           0.000
25%      206150.250
50%      241890.000
75%      268159.000
max     1395217.000
Name: general-motors, dtype: float64

In [77]:
# an extra large number is observed
max=gm['general-motors'].max()
print(gm.loc[gm['general-motors'] == max])

# '1' in string'1395217.00' is redundant, change to '395217'
gm['general-motors'][17]=395217

    Year Month  general-motors
17  2006   Jun     1395217.000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gm['general-motors'][17]=395217


In [78]:
# another outlier is observed. The number should not have decimal places.
print(gm['general-motors'].nsmallest(15,keep='all'))

# it is believed that '189.561' is error, it should be '189561'
gm['general-motors'][52]=189561

195        0.000
196        0.000
197        0.000
198        0.000
199        0.000
200        0.000
201        0.000
202        0.000
203        0.000
52       189.561
183    88377.000
49    124879.000
48    126290.000
61    140873.000
60    145388.000
Name: general-motors, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gm['general-motors'][52]=189561


In [79]:
# check is any duplicated row =>0
gm[gm.duplicated()]

# remane the column name
gm=gm.rename(columns={'general-motors':'Gm'})

# change the datatype of 'Gm' to intger 
gm['Gm']=gm['Gm'].astype(int)

## 2.  cleaning ford-motor-company_carsales_src _dirty.csv

In [80]:
#read the Ford file
ford = pd.read_csv("ford-motor-company_carsales_src _dirty.csv", 
                                   encoding='ISO-8859-1')

In [81]:
#show null value
ford[ford.isnull().values==True]

Unnamed: 0,Year,Month,ford-motor-company
3,2005Apr260741,,
3,2005Apr260741,,
39,,,
39,,,
39,,,


### drop the row that all cells are null value

In [82]:
#drop the row that all values are null values
ford=ford.dropna(axis=0, how='all')

In [83]:
# check the left rows which contain null value
ford[ford.isnull().values==True]

Unnamed: 0,Year,Month,ford-motor-company
3,2005Apr260741,,
3,2005Apr260741,,


### fix the  rows with error

In [84]:
# fix the rows with error
ford['ford-motor-company'][3]=ford['Year'][3][7:]
ford['Month'][3]=ford['Year'][3][4:7]
ford['Year'][3]=ford['Year'][3][0:4]

#check if the error fix
ford.iloc[3:5]

Unnamed: 0,Year,Month,ford-motor-company
3,2005,Apr,260741.0
4,2005,May,263949.0


### drop the duplicated row

In [85]:
# check the duplicated rows
ford[ford.duplicated()]

Unnamed: 0,Year,Month,ford-motor-company
18,2006,Jun,246815.0


In [86]:
# drop the duplicated row
ford.drop_duplicates(inplace = True)

# check if duplicated row still exist
ford[ford.duplicated()]

Unnamed: 0,Year,Month,ford-motor-company


### sort the rows by month

In [87]:
# original row : ordered, changed row: Jan and Mar are exchanged
ford[24:36]

Unnamed: 0,Year,Month,ford-motor-company
25,2007,Mar,243541.0
26,2007,Feb,194310.0
27,2007,Jan,151416.0
28,2007,Apr,209694.0
29,2007,May,239579.0
30,2007,Jun,228376.0
31,2007,Jul,177167.0
32,2007,Aug,200401.0
33,2007,Sep,173554.0
34,2007,Oct,179652.0


In [88]:
# create a new column and sort by 'Year' and 'Month', then drop 'month_number'
ford['month_number']=ford['Month'].replace(['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'],[1,2,3,4,5,6,7,8,9,10,11,12])
ford=ford.sort_values(by=['Year','month_number'])
ford=ford.drop(['month_number'],axis=1)
ford[24:36]

Unnamed: 0,Year,Month,ford-motor-company
27,2007,Jan,151416.0
26,2007,Feb,194310.0
25,2007,Mar,243541.0
28,2007,Apr,209694.0
29,2007,May,239579.0
30,2007,Jun,228376.0
31,2007,Jul,177167.0
32,2007,Aug,200401.0
33,2007,Sep,173554.0
34,2007,Oct,179652.0


In [89]:
# remane the column name
ford=ford.rename(columns={'ford-motor-company':'Ford'})

# change the datatype of 'Ford' to intger 
ford['Ford']=ford['Ford'].astype(int)

## 3. cleaning tesla-inc_carsales_src _dirty.csv

In [90]:
#read the Ford file
tesla = pd.read_csv("tesla-inc_carsales_src _dirty.csv", 
                                   encoding='ISO-8859-1')

In [91]:
#show null value
tesla[tesla.isnull().values==True]

Unnamed: 0,Year,Month,tesla-inc,Unnamed: 3
0,2015,Jan,0,
1,,2015,Feb,2000.000
2,2015,Mar,1200,
3,2015,Apr,1700,
4,2015,May,1700,
...,...,...,...,...
79,2021,Aug,0,
80,2021,Sep,0,
81,2021,Oct,0,
82,2021,Nov,0,


### fix the moved row

In [92]:
# it is observed that row (index 2) is shifted
tesla

Unnamed: 0,Year,Month,tesla-inc,Unnamed: 3
0,2015,Jan,0,
1,,2015,Feb,2000.000
2,2015,Mar,1200,
3,2015,Apr,1700,
4,2015,May,1700,
...,...,...,...,...
79,2021,Aug,0,
80,2021,Sep,0,
81,2021,Oct,0,
82,2021,Nov,0,


In [93]:
# fix the moved row
tesla['Year'][1]=tesla['Month'][1]
tesla['Month'][1]=tesla['tesla-inc'][1]
tesla['tesla-inc'][1]=tesla['Unnamed: 3'][1]
tesla=tesla.drop(['Unnamed: 3'],axis=1)
tesla.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tesla['Year'][1]=tesla['Month'][1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tesla['Month'][1]=tesla['tesla-inc'][1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tesla['tesla-inc'][1]=tesla['Unnamed: 3'][1]


Unnamed: 0,Year,Month,tesla-inc
0,2015,Jan,0.0
1,2015,Feb,2000.0
2,2015,Mar,1200.0
3,2015,Apr,1700.0
4,2015,May,1700.0
5,2015,Jun,1700.0
6,2015,Jul,1700.0
7,2015,Aug,1700.0
8,2015,Sep,1700.0
9,2015,Oct,1730.0


### fix the abnormal string

In [94]:
# show the abnormal string in 'Year', '20 18' is found
print(tesla['Year'].value_counts())
print('*'*20)
print(tesla.loc[tesla['Year']=='20 18'])

# replace the abnormal string
tesla['Year'][43]=tesla['Year'][43].replace(' ','')

2019     12
2021     12
2020     12
2017     12
2015     12
2016     12
2018     11
20 18     1
Name: Year, dtype: int64
********************
     Year Month tesla-inc
43  20 18   Aug     21700


In [95]:
# show the abnormal string in 'Month', 'Jan%' is found
print(tesla['Month'].value_counts())
print('*'*20)
print(tesla.loc[tesla['Month']=='Jan%'])

# replace the abnormal string
tesla['Month'][24]=tesla['Month'][24].replace('%','')

May     7
Sep     7
Dec     7
Oct     7
Mar     7
Apr     7
Aug     7
Jun     7
Nov     7
Feb     7
Jul     7
Jan     6
Jan%    1
Name: Month, dtype: int64
********************
    Year Month tesla-inc
24  2017  Jan%      2800


In [96]:
# remane the column name
tesla=tesla.rename(columns={'tesla-inc':'Tesla'})

# change the datatype of 'Tesla' to intger 
tesla['Tesla']=tesla['Tesla'].astype(int)

## merge three dataset

In [97]:
# create a new column which combines 'Year' and 'Month', and remove 'Year' and 'Month'
def process(name):
    name['Year']=name['Year'].astype(str)
    name['year_month'] = name['Year'] + "." + name['Month']
    name = name.drop(labels=['Year','Month'],axis=1)
    cols_name = name.columns.tolist()
    cols_name = cols_name[-1:] + cols_name[:-1]
    cols_name
    name = name[cols_name]
    return name

GM1=process(gm)
FORD1=process(ford)
TESLA1=process(tesla)

In [98]:
# merge the dataset
total = pd.merge(GM1,FORD1,how='left',on = 'year_month')
total1 = pd.merge(total,TESLA1,how='left',on = 'year_month')

# instead null value with 0 in total1['Tesla']
total1['Tesla']=total1['Tesla'].fillna(0)
total1['Tesla']=total1['Tesla'].astype(int)

# outpot file
total1.to_csv("carsales_stage.csv", index=False)

In [99]:
total1

Unnamed: 0,year_month,Gm,Ford,Tesla
0,2005.Jan,269125,183379,0
1,2005.Feb,241446,234208,0
2,2005.Mar,409418,281906,0
3,2005.Apr,368088,260741,0
4,2005.May,372586,263949,0
...,...,...,...,...
199,2021.Aug,0,0,0
200,2021.Sep,0,0,0
201,2021.Oct,0,0,0
202,2021.Nov,0,0,0
