In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split # to split a dataset into two parts: training and testing
from sklearn.preprocessing import StandardScaler, Normalizer, OneHotEncoder, LabelEncoder

In [2]:
df= pd.read_csv('../data/avocado.csv') # Importing the dataset
df.drop(['Unnamed: 0'], axis=1,inplace=True) #dropping the Unnamed: 0 column
df.head()

Unnamed: 0,Date,AveragePrice,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,year,region
0,2015-12-27,1.33,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,2015,Albany
1,2015-12-20,1.35,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,2015,Albany
2,2015-12-13,0.93,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,2015,Albany
3,2015-12-06,1.08,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,2015,Albany
4,2015-11-29,1.28,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,2015,Albany


In [3]:
df.isnull().any(axis=1).sum() # To check for null values in the dataset

0

In [4]:
X = df.drop('AveragePrice', axis=1)
y = df['AveragePrice']

In [5]:
X["Date"] = pd.to_datetime(X["Date"])
#remove date and concat month and day and year as an int
X["Day-Month-Year"] = X["Date"].dt.year * 10000 + X["Date"].dt.month * 100 + X["Date"].dt.day
X.drop(["Date"], axis=1, inplace=True)
X.drop("year", axis=1, inplace=True)

In [6]:
X.head()

Unnamed: 0,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,type,region,Day-Month-Year
0,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,conventional,Albany,20151227
1,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,conventional,Albany,20151220
2,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,conventional,Albany,20151213
3,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,conventional,Albany,20151206
4,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,conventional,Albany,20151129


In [7]:
y.head()

0    1.33
1    1.35
2    0.93
3    1.08
4    1.28
Name: AveragePrice, dtype: float64

In [8]:
le = LabelEncoder()
X_le = X.copy()
X_le["type"]= le.fit_transform(X["type"])
X_le["region"] = le.fit_transform(X["region"])

In [25]:
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[8,9])],remainder='passthrough')
X_ct=pd.DataFrame(ct.fit_transform(X).toarray())

enc_col_names = ct.named_transformers_["encoder"].get_feature_names_out()
col_names = list(enc_col_names) + list(X.columns.drop(X.columns[[8,9]]))
X_ct.columns = col_names

In [26]:
X_ct.head()

Unnamed: 0,type_conventional,type_organic,region_Albany,region_Atlanta,region_BaltimoreWashington,region_Boise,region_Boston,region_BuffaloRochester,region_California,region_Charlotte,...,region_WestTexNewMexico,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,Day-Month-Year
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,64236.62,1036.74,54454.85,48.16,8696.87,8603.62,93.25,0.0,20151227.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,54876.98,674.28,44638.81,58.33,9505.56,9408.07,97.49,0.0,20151220.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,118220.22,794.7,109149.67,130.5,8145.35,8042.21,103.14,0.0,20151213.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,78992.15,1132.0,71976.41,72.58,5811.16,5677.4,133.76,0.0,20151206.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,51039.6,941.48,43838.39,75.78,6183.95,5986.26,197.69,0.0,20151129.0


In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(X_ct,y,test_size=0.2, random_state=1)

In [28]:
X_test

Unnamed: 0,type_conventional,type_organic,region_Albany,region_Atlanta,region_BaltimoreWashington,region_Boise,region_Boston,region_BuffaloRochester,region_California,region_Charlotte,...,region_WestTexNewMexico,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,Day-Month-Year
14192,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,30480.39,1924.24,25560.94,35.13,2960.08,363.33,2596.75,0.00,20160717.0
1873,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,416298.84,82416.56,134956.77,13276.06,185649.45,185479.46,29.77,140.22,20151220.0
7293,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,529138.31,340439.27,66961.88,98.55,121638.61,67552.66,54085.95,0.00,20170507.0
15087,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,181725.85,32417.62,60616.15,0.00,88692.08,82117.33,6574.75,0.00,20170618.0
9888,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1377.24,7.07,1053.50,0.00,316.67,316.67,0.00,0.00,20150503.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4471,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,308711.73,201918.77,31729.56,234.28,74829.12,46089.44,28739.68,0.00,20160103.0
12556,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10777.27,4802.52,397.26,0.00,5577.49,5384.85,192.64,0.00,20160103.0
9063,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,257250.20,98471.49,7427.50,582.75,150768.46,123950.80,26817.66,0.00,20180121.0
7497,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2223310.81,801224.73,664924.43,28132.18,729029.47,707685.17,6808.01,14536.29,20170702.0


In [29]:
X_train

Unnamed: 0,type_conventional,type_organic,region_Albany,region_Atlanta,region_BaltimoreWashington,region_Boise,region_Boston,region_BuffaloRochester,region_California,region_Charlotte,...,region_WestTexNewMexico,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,Day-Month-Year
8051,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7103726.14,4335593.06,1264050.32,18959.07,1485123.69,1360058.70,110035.17,15029.82,20170115.0
17367,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3056.43,0.00,164.45,0.00,2891.98,1093.12,1798.86,0.00,20170611.0
12549,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,17232.43,7305.62,614.56,0.00,9312.25,6311.81,3000.44,0.00,20160221.0
17045,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,40058.59,1411.24,16587.80,24.73,22034.82,281.45,21753.37,0.00,20170709.0
6631,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,194103.11,8575.56,86633.97,752.93,98140.65,35477.95,62594.13,68.57,20171105.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10955,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,30717.28,3595.24,13212.07,311.49,13598.48,7405.56,6192.92,0.00,20151025.0
17289,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6102.50,804.30,1175.92,0.00,4122.28,1492.22,2630.06,0.00,20171203.0
5192,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5108381.41,2872173.10,1020821.92,423920.15,791466.24,658990.14,124051.73,8424.37,20160221.0
12172,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,18030.82,8.23,593.13,0.00,17429.46,15470.61,1958.85,0.00,20160522.0


In [30]:
Y_test

14192    2.11
1873     0.98
7293     1.17
15087    1.78
9888     1.61
         ... 
4471     1.11
12556    1.26
9063     1.05
7497     1.31
9151     2.04
Name: AveragePrice, Length: 3650, dtype: float64

In [31]:
Y_train

8051     0.68
17367    2.37
12549    1.18
17045    2.12
6631     1.26
         ... 
10955    1.64
17289    1.81
5192     0.82
12172    1.57
235      1.24
Name: AveragePrice, Length: 14599, dtype: float64

In [32]:
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [33]:
X_train_std

array([[ 1.00294976, -1.00294976, -0.13932714, ...,  0.23781564,
         0.67671476,  0.85496156],
       [-0.99705892,  0.99705892, -0.13932714, ..., -0.21439719,
        -0.17335577,  0.90787124],
       [-0.99705892,  0.99705892, -0.13932714, ..., -0.20937697,
        -0.17335577, -0.20045859],
       ...,
       [ 1.00294976, -1.00294976, -0.13932714, ...,  0.29637703,
         0.30311758, -0.20045859],
       [-0.99705892,  0.99705892, -0.13932714, ..., -0.21372875,
        -0.17335577, -0.1683501 ],
       [ 1.00294976, -1.00294976, -0.13932714, ..., -0.22068731,
        -0.17335577, -1.22451696]])

In [34]:
X_test_std

array([[-9.97058918e-01,  9.97058918e-01, -1.39327141e-01, ...,
        -2.11063592e-01, -1.73355771e-01, -1.47548911e-01],
       [ 1.00294976e+00, -1.00294976e+00, -1.39327141e-01, ...,
        -2.21788470e-01, -1.65425078e-01, -1.16061998e+00],
       [ 1.00294976e+00, -1.00294976e+00, -1.39327141e-01, ...,
         4.05901237e-03, -1.73355771e-01,  8.96777275e-01],
       ...,
       [ 1.00294976e+00, -1.00294976e+00, -1.39327141e-01, ...,
        -1.09868288e-01, -1.73355771e-01,  1.92232906e+00],
       [ 1.00294976e+00, -1.00294976e+00, -1.39327141e-01, ...,
        -1.93468888e-01,  6.48801229e-01,  9.17578461e-01],
       [-9.97058918e-01,  9.97058918e-01,  7.17735247e+00, ...,
        -2.21912850e-01, -1.73355771e-01, -1.21555645e+00]])

In [35]:
nrm = Normalizer()
X_train_norm = nrm.fit_transform(X_train)
X_test_norm = nrm.fit_transform(X_test)

In [36]:
X_train_norm

array([[4.55601102e-08, 0.00000000e+00, 0.00000000e+00, ...,
        5.01321448e-03, 6.84760256e-04, 9.18952663e-01],
       [0.00000000e+00, 4.95770789e-08, 0.00000000e+00, ...,
        8.91822241e-05, 0.00000000e+00, 9.99999973e-01],
       [0.00000000e+00, 4.96026012e-08, 0.00000000e+00, ...,
        1.48829629e-04, 0.00000000e+00, 9.99999402e-01],
       ...,
       [4.75072848e-08, 0.00000000e+00, 0.00000000e+00, ...,
        5.89336086e-03, 4.00218945e-04, 9.57757360e-01],
       [0.00000000e+00, 4.96018370e-08, 0.00000000e+00, ...,
        9.71625585e-05, 0.00000000e+00, 9.99998927e-01],
       [4.95962741e-08, 0.00000000e+00, 0.00000000e+00, ...,
        1.45480751e-05, 0.00000000e+00, 9.99395722e-01]])

In [37]:
X_test_norm

array([[0.00000000e+00, 4.96013128e-08, 0.00000000e+00, ...,
        1.28802209e-04, 0.00000000e+00, 9.99998030e-01],
       [4.96084587e-08, 0.00000000e+00, 0.00000000e+00, ...,
        1.47684382e-06, 6.95609808e-06, 9.99670965e-01],
       [4.95516042e-08, 0.00000000e+00, 0.00000000e+00, ...,
        2.68004559e-03, 0.00000000e+00, 9.99480980e-01],
       ...,
       [4.95467371e-08, 0.00000000e+00, 0.00000000e+00, ...,
        1.32872755e-03, 0.00000000e+00, 9.99859149e-01],
       [4.91522005e-08, 0.00000000e+00, 0.00000000e+00, ...,
        3.34628673e-04, 7.14490641e-04, 9.91434389e-01],
       [0.00000000e+00, 4.96260549e-08, 4.96260549e-08, ...,
        0.00000000e+00, 0.00000000e+00, 9.99999992e-01]])

In [38]:
df_std = pd.DataFrame(X_train_std, columns=X_ct.columns)

In [39]:
df_std.describe()

Unnamed: 0,type_conventional,type_organic,region_Albany,region_Atlanta,region_BaltimoreWashington,region_Boise,region_Boston,region_BuffaloRochester,region_California,region_Charlotte,...,region_WestTexNewMexico,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,Day-Month-Year
count,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,...,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0
mean,2.3361910000000003e-17,-2.3361910000000003e-17,1.971161e-17,2.5308740000000004e-17,-4.9644060000000006e-17,5.524118e-17,-2.6282150000000003e-17,6.181172000000001e-17,2.5308740000000004e-17,5.840477e-18,...,-1.3627780000000001e-17,-5.207759e-16,-5.558188e-16,2.4822030000000003e-17,5.519251e-16,-7.787303e-18,6.760352e-16,-2.153189e-15,-5.679864e-16,8.051487e-14
std,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,...,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034
min,-0.9970589,-1.00295,-0.1393271,-0.1354453,-0.1351828,-0.1375282,-0.1364904,-0.1362298,-0.1370102,-0.1372695,...,-0.1343926,-0.2450824,-0.2303513,-0.2437441,-0.2125293,-0.2420296,-0.2434388,-0.2219128,-0.1733558,-1.279667
25%,-0.9970589,-1.00295,-0.1393271,-0.1354453,-0.1351828,-0.1375282,-0.1364904,-0.1362298,-0.1370102,-0.1372695,...,-0.1343926,-0.2419161,-0.2296705,-0.2412025,-0.2125293,-0.2367489,-0.2395271,-0.2213779,-0.1733558,-1.182168
50%,-0.9970589,0.9970589,-0.1393271,-0.1354453,-0.1351828,-0.1375282,-0.1364904,-0.1362298,-0.1370102,-0.1372695,...,-0.1343926,-0.2137607,-0.2234837,-0.2192526,-0.2107126,-0.2015663,-0.2085795,-0.2106682,-0.1733558,-0.1372017
75%,1.00295,0.9970589,-0.1393271,-0.1354453,-0.1351828,-0.1375282,-0.1364904,-0.1362298,-0.1370102,-0.1372695,...,-0.1343926,-0.119705,-0.1425711,-0.1177159,-0.1533113,-0.1278419,-0.1298887,-0.1306245,-0.1662084,0.9071245
max,1.00295,0.9970589,7.177352,7.383055,7.39739,7.271234,7.32652,7.340535,7.298724,7.284941,...,7.440887,18.1994,18.02908,17.11426,17.93126,19.76185,18.03932,23.67255,31.02985,1.94409


In [40]:
df_nrm = pd.DataFrame(X_train_norm, columns=X_ct.columns)

In [41]:
df_nrm.describe()

Unnamed: 0,type_conventional,type_organic,region_Albany,region_Atlanta,region_BaltimoreWashington,region_Boise,region_Boston,region_BuffaloRochester,region_California,region_Charlotte,...,region_WestTexNewMexico,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,Day-Month-Year
count,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,...,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0,14599.0
mean,2.434832e-08,2.487086e-08,9.444518e-10,8.932962e-10,8.896121e-10,9.206611e-10,9.068093e-10,9.036929e-10,8.876352e-10,9.1726e-10,...,8.79345e-10,0.03233,0.011048,0.011227,0.000863,0.009193,0.006991,0.002084,0.000118,0.992361
std,2.455276e-08,2.479857e-08,6.778896e-09,6.595482e-09,6.581035e-09,6.694573e-09,6.643986e-09,6.633817e-09,6.482608e-09,6.682415e-09,...,6.543334e-09,0.088599,0.034222,0.03168,0.003051,0.025471,0.019413,0.006837,0.00051,0.051781
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.265253
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000536,4.2e-05,0.000148,0.0,0.000254,0.000142,6e-06,0.0,0.999663
50%,0.0,4.940927e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.005269,0.000424,0.00143,9e-06,0.001942,0.001266,0.000133,0.0,0.999977
75%,4.957126e-08,4.960087e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.021067,0.005412,0.007356,0.000306,0.005484,0.004121,0.001081,6e-06,1.0
max,4.962737e-08,4.962754e-08,4.96275e-08,4.962752e-08,4.96275e-08,4.962754e-08,4.962753e-08,4.962754e-08,4.962617e-08,4.962753e-08,...,4.962753e-08,0.82284,0.348988,0.36017,0.04079,0.295522,0.226047,0.091191,0.01082,1.0


In [44]:
df_nrm["AveragePrice"] = y

In [45]:
df_nrm

Unnamed: 0,type_conventional,type_organic,region_Albany,region_Atlanta,region_BaltimoreWashington,region_Boise,region_Boston,region_BuffaloRochester,region_California,region_Charlotte,...,Total Volume,4046,4225,4770,Total Bags,Small Bags,Large Bags,XLarge Bags,Day-Month-Year,AveragePrice
0,4.556011e-08,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,0.323647,1.975301e-01,0.057590,0.000864,0.067662,0.061964,0.005013,0.000685,0.918953,1.33
1,0.000000e+00,4.957708e-08,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,0.000152,0.000000e+00,0.000008,0.000000,0.000143,0.000054,0.000089,0.000000,1.000000,1.35
2,0.000000e+00,4.960260e-08,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,0.000855,3.623778e-04,0.000030,0.000000,0.000462,0.000313,0.000149,0.000000,0.999999,0.93
3,0.000000e+00,4.957667e-08,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,0.001986,6.996457e-05,0.000822,0.000001,0.001092,0.000014,0.001078,0.000000,0.999997,1.08
4,4.957221e-08,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,0.009622,4.251094e-04,0.004295,0.000037,0.004865,0.001759,0.003103,0.000003,0.999926,1.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14594,0.000000e+00,4.962518e-08,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,0.001524,1.784144e-04,0.000656,0.000015,0.000675,0.000368,0.000307,0.000000,0.999998,1.63
14595,0.000000e+00,4.957562e-08,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,0.000303,3.987367e-05,0.000058,0.000000,0.000204,0.000074,0.000130,0.000000,1.000000,1.50
14596,4.750728e-08,0.000000e+00,0.0,0.0,0.0,0.0,0.000000e+00,0.0,0.0,0.0,...,0.242685,1.364491e-01,0.048496,0.020139,0.037600,0.031307,0.005893,0.000400,0.957757,1.52
14597,0.000000e+00,4.960184e-08,0.0,0.0,0.0,0.0,4.960184e-08,0.0,0.0,0.0,...,0.000894,4.082231e-07,0.000029,0.000000,0.000865,0.000767,0.000097,0.000000,0.999999,1.71


In [27]:
df_nrm.to_csv("avocado_nrm.csv", index=False)