In [500]:
import numpy as np 
import pandas as pd
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline 

In [501]:
sale =pd.read_csv('Date and model wise sale.csv')

# Data cleansing 

In [502]:
sale.head()

Unnamed: 0,Date,Model,Count
0,31-Aug-16,M45,5
1,31-Aug-16,M121,3
2,31-Aug-16,M122,4
3,31-Aug-16,M91,10
4,31-Aug-16,M66,57


In [503]:
print("Shape of sale dataset",sale.shape)

Shape of sale dataset (46116, 3)


In [504]:
sale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46116 entries, 0 to 46115
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    46116 non-null  object
 1   Model   46116 non-null  object
 2   Count   46116 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [505]:
# number of unique values
sale.nunique()

Date      971
Model     124
Count    1191
dtype: int64

In [506]:
#checking for missing values
sale.isnull().sum()

Date     0
Model    0
Count    0
dtype: int64

In [507]:
sale.columns

Index(['Date', 'Model', 'Count'], dtype='object')

In [508]:
# Removing duplicate rows
sale=sale.drop_duplicates(keep='first')

In [509]:
sale.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 46116 entries, 0 to 46115
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    46116 non-null  object
 1   Model   46116 non-null  object
 2   Count   46116 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [510]:
sale.value_counts

<bound method DataFrame.value_counts of             Date Model  Count
0      31-Aug-16   M45      5
1      31-Aug-16  M121      3
2      31-Aug-16  M122      4
3      31-Aug-16   M91     10
4      31-Aug-16   M66     57
...          ...   ...    ...
46111  01-Jan-14   M74      1
46112  01-Jan-14   M59      1
46113  01-Jan-14   M60      5
46114  01-Jan-14   M54      1
46115  01-Jan-14   M58      1

[46116 rows x 3 columns]>

In [511]:
sale.shape

(46116, 3)

In [512]:
sale['Date'] = pd.to_datetime(sale['Date'], dayfirst=True)
sale['Date'] = sale['Date'].apply(lambda x: x.strftime('%Y-%m-%d'))
sale.head()

Unnamed: 0,Date,Model,Count
0,2016-08-31,M45,5
1,2016-08-31,M121,3
2,2016-08-31,M122,4
3,2016-08-31,M91,10
4,2016-08-31,M66,57


# Data visualize

In [513]:
import plotly.express as px
import chart_studio.plotly as py
fig = px.line(sale, x='Date', y='Count')

# Use date string to set xaxis range
fig.update_layout(xaxis_range=['2016-08-31','2014-01-1'],
                  title_text=" Sale on Date")
fig.show()
 

In [514]:
fig = px.scatter(sale, x='Date', y='Model',hover_name='Model')

# Use date string to set xaxis range
fig.update_layout(xaxis_range=['2016-08-31','2014-01-1'],
                  title_text=" Sale Model on Date")
fig.show()

In [515]:
from datetime import datetime
sale['Date'] = pd.to_datetime(sale['Date'], dayfirst=True)
sale['Date'] = sale['Date'].values.astype(np.int64) // 10 ** 9
sale.head(500)

Unnamed: 0,Date,Model,Count
0,1472601600,M45,5
1,1472601600,M121,3
2,1472601600,M122,4
3,1472601600,M91,10
4,1472601600,M66,57
...,...,...,...
495,1472169600,M71,7
496,1472169600,M120,1
497,1472169600,M43,3
498,1472169600,M13,15


In [516]:
from sklearn.preprocessing import OneHotEncoder

Ohe = OneHotEncoder(sparse=False)
Model=Ohe.fit_transform(sale[['Model']])
sale["Model"] = Model
Model
sale.head()

Unnamed: 0,Date,Model,Count
0,1472601600,0.0,5
1,1472601600,0.0,3
2,1472601600,0.0,4
3,1472601600,0.0,10
4,1472601600,0.0,57


# Data split 


In [517]:
 x = sale.drop(["Count"], axis = 1)
y=  sale["Count"]

In [518]:
##Data training
from sklearn.model_selection import train_test_split
X_train_org, X_test_org, y_train, y_test = train_test_split(x,y, test_size = 0.25, random_state = 0)

In [521]:
#feature Scalling 
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train_org)
X_test  = scaler.transform(X_test_org)

In [522]:
 print("Size of training set: {}      size of test set: {}\n".format(X_train.shape[0], X_test.shape[0]))

Size of training set: 34587      size of test set: 11529



# Linear Regression

In [523]:
#for LINEAR Regression
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train,y_train)
print('Train score: %.4f'%linreg.score(X_train,y_train))


Train score: 0.0047


In [524]:
print('Test score: %.4f'%linreg.score(X_test,y_test))

Test score: 0.0033


In [532]:
 y_pred_lin = linreg.predict(X_train)
 print('predicted response:', y_pred_lin, sep='\n')
 print("Average predict response: {:.2f}".format(y_pred_lin.mean()))

predicted response:
[ 98.24170341  17.10741341  99.38802567 ...  65.70378699  62.08846294
 125.9297949 ]
Average predict response: 98.49


In [526]:
from sklearn.metrics import mean_squared_error
lin_train_mse = mean_squared_error( y_train,y_pred_lin )
print('Train mse :',lin_train_mse)


Train mse : 101025.88736760717


In [527]:
Lin_train_rmse = np.sqrt(lin_train_mse)
print( 'Train Rmse :' ,Lin_train_rmse)

Train Rmse : 317.84569741874304


In [529]:
 y_pred_test_lin = linreg.predict(X_test)
 print('test predicted response:', y_pred_test_lin, sep='\n')
 print("Average test predict response: {:.2f}".format(y_pred_test_lin.mean()))

test predicted response:
[127.07611716 116.31832365  95.33180845 ... 116.84739546 101.59249155
 106.00142332]
Average test predict response: 98.40


In [530]:
lin_test_mse = mean_squared_error( y_test,y_pred_test_lin )
print('Test mse : ',lin_test_mse)

Test mse :  92201.56820346983


In [531]:
Lin_test_rmse = np.sqrt(lin_test_mse)
print('Test Rmse : ' ,Lin_test_rmse)

Test Rmse :  303.6471113043393
