In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# This option ensures that the graphs you create are displayed within the notebook without the need to "call" them specifically.

%matplotlib inline

In [3]:
path = r"C:\Users\harip\Dairy data Analysis"

In [4]:
# Importing the dataset
df= pd.read_csv(os.path.join(path, '02 Data', 'Prepared data', 'Dairy data cleaned.csv'), index_col = 0)

In [5]:
df.head()

Unnamed: 0,States,Total Land Area (acres),Number of Cows,Farm Size per sq.km,Date recording date,Product ID,Product Name,Brand,Quantity (liters/kg),Overall Prices,...,Price per Unit (sold),Approx. Total Revenue(INR),Customer Location,Sales categories,Quantity in Stock (liters/kg),Minimum Stock Threshold (liters/kg),Reorder Quantity (liters/kg),Year of recording data,Sales,Price flag
0,Telangana,310.84,96,Medium,17/02/2022,5,Ice Cream,Dodla Dairy,222.4,85.72,...,82.24,575.68,Madhya Pradesh,Wholesale,215,19.55,64.03,2022,Low Sales,Moderately priced
1,Uttar Pradesh,19.19,44,Large,01/12/2021,1,Milk,Amul,687.48,42.61,...,39.24,21895.92,Kerala,Wholesale,129,43.17,181.1,2021,Moderate Sales,Moderately priced
2,Tamil Nadu,581.69,24,Medium,28/02/2022,4,Yogurt,Dodla Dairy,503.48,36.5,...,33.81,8655.36,Madhya Pradesh,Online,247,15.1,140.83,2022,Low Sales,Moderately priced
3,Telangana,908.0,89,Small,09/06/2019,3,Cheese,Britannia Industries,823.36,26.52,...,28.92,17380.92,Rajasthan,Online,222,74.5,57.68,2019,Low Sales,Low priced
4,Maharashtra,861.95,21,Medium,14/12/2020,8,Buttermilk,Mother Dairy,147.77,83.85,...,83.07,12045.15,Jharkhand,Retail,2,76.02,33.4,2020,Low Sales,Moderately priced


# Data Cleaning

In [6]:
df.shape

(4325, 26)

In [7]:
# Check for missing values
df.isnull().sum()

States                                 0
Total Land Area (acres)                0
Number of Cows                         0
Farm Size per sq.km                    0
Date recording date                    0
Product ID                             0
Product Name                           0
Brand                                  0
Quantity (liters/kg)                   0
Overall Prices                         0
Total Value of products available      0
Shelf Life (days)                      0
Storage Condition                      0
Production Date                        0
Expiration Date                        0
Quantity Sold (liters/kg)              0
Price per Unit (sold)                  0
Approx. Total Revenue(INR)             0
Customer Location                      0
Sales categories                       0
Quantity in Stock (liters/kg)          0
Minimum Stock Threshold (liters/kg)    0
Reorder Quantity (liters/kg)           0
Year of recording data                 0
Sales           

No missing values seen

In [8]:
# Check for duplicates
dups = df.duplicated()

In [9]:
dups

0       False
1       False
2       False
3       False
4       False
        ...  
4320    False
4321    False
4322    False
4323    False
4324    False
Length: 4325, dtype: bool

No duplicates seen

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4325 entries, 0 to 4324
Data columns (total 26 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   States                               4325 non-null   object 
 1   Total Land Area (acres)              4325 non-null   float64
 2   Number of Cows                       4325 non-null   int64  
 3   Farm Size per sq.km                  4325 non-null   object 
 4   Date recording date                  4325 non-null   object 
 5   Product ID                           4325 non-null   int64  
 6   Product Name                         4325 non-null   object 
 7   Brand                                4325 non-null   object 
 8   Quantity (liters/kg)                 4325 non-null   float64
 9   Overall Prices                       4325 non-null   float64
 10  Total Value of products available    4325 non-null   float64
 11  Shelf Life (days)             

In [11]:
df.describe()

Unnamed: 0,Total Land Area (acres),Number of Cows,Product ID,Quantity (liters/kg),Overall Prices,Total Value of products available,Shelf Life (days),Quantity Sold (liters/kg),Price per Unit (sold),Approx. Total Revenue(INR),Quantity in Stock (liters/kg),Minimum Stock Threshold (liters/kg),Reorder Quantity (liters/kg),Year of recording data
count,4325.0,4325.0,4325.0,4325.0,4325.0,4325.0,4325.0,4325.0,4325.0,4325.0,4325.0,4325.0,4325.0,4325.0
mean,503.483073,54.963699,5.509595,500.652657,54.785938,27357.845411,29.12763,248.095029,54.77914,13580.265401,252.068671,55.826143,109.10782,2020.466358
std,285.935061,26.111487,2.842979,288.975915,26.002815,21621.051594,30.272114,217.024182,26.19279,14617.009122,223.62087,26.30145,51.501035,1.113822
min,10.17,10.0,1.0,1.17,10.03,42.5165,1.0,1.0,5.21,12.54,0.0,10.02,20.02,2019.0
25%,252.95,32.0,3.0,254.17,32.46,9946.8145,10.0,69.0,32.64,2916.65,66.0,32.91,64.28,2019.0
50%,509.17,55.0,6.0,497.55,54.4,21869.6529,22.0,189.0,54.14,8394.54,191.0,56.46,108.34,2020.0
75%,751.25,77.0,8.0,749.78,77.46,40954.441,30.0,374.0,77.46,19504.55,387.0,79.01,153.39,2021.0
max,999.53,100.0,10.0,999.93,99.99,99036.3696,150.0,960.0,104.51,89108.9,976.0,99.99,199.95,2022.0


# Data preparation for Regression analysis

### Model 1: Product name vs Quality sold

In [12]:
X = df['Product Name']

In [13]:
X = pd.get_dummies(data = X, drop_first = True)
X.head()

Unnamed: 0,Buttermilk,Cheese,Curd,Ghee,Ice Cream,Lassi,Milk,Paneer,Yogurt
0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0


In [14]:
y = df['Quantity Sold (liters/kg)']

In [15]:
y

0         7
1       558
2       256
3       601
4       145
       ... 
4320    352
4321     68
4322    141
4323      2
4324    417
Name: Quantity Sold (liters/kg), Length: 4325, dtype: int64

In [16]:
# Split data into a training set and a test set.  Using test_train_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 0)


#### Regression Analysis

In [17]:
# Create regression object
regression = LinearRegression()

In [18]:
# Fit the regression to the training set of each variable.  This trains the model.
regression.fit(X_train, y_train)

In [19]:
# Apply model to x_test data to predict y values
y_predicted = regression.predict(X_test)

In [20]:
# Check the slope, MSE and R2 score.
rmse = mean_squared_error(y_test, y_predicted)
r2 = r2_score(y_test, y_predicted)

In [21]:
# Print the model summary statistics. This is where you evaluate the performance of the model.

print('Slope:' ,regression.coef_)
print('Mean squared error: ', rmse)
print('R2 score: ', r2)

Slope: [-23.3121886  -12.95976266  -6.7019167  -13.3332811  -11.7527041
  -6.40997181  -7.40323408 -10.36577315  -4.462191  ]
Mean squared error:  43988.40269434572
R2 score:  -0.007286734329557509


From the summary statistics, the model is showing that these two variables are not a good fit as the slope, r2 values showing negative trend and it is confirmed by the higher rmse values.

#### training set

In [22]:
# Predict on the training set

y_predicted_train = regression.predict(X_train)

In [23]:
rmse = mean_squared_error(y_train, y_predicted_train)
r2 = r2_score(y_train, y_predicted_train)

In [24]:
print('Slope:' ,regression.coef_)
print('Mean squared error: ', rmse)
print('R2 score: ', r2)

Slope: [-23.3121886  -12.95976266  -6.7019167  -13.3332811  -11.7527041
  -6.40997181  -7.40323408 -10.36577315  -4.462191  ]
Mean squared error:  48429.77980957894
R2 score:  0.0007799558596361944


From the training set, the model outcome are almost similar to the test set. But could see a significant increase in r2 values from negative to positive but still remains so less indicating that the model is not a good fit.

## Model 2: Product names and Approx total revenue.

In [25]:
X

Unnamed: 0,Buttermilk,Cheese,Curd,Ghee,Ice Cream,Lassi,Milk,Paneer,Yogurt
0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
4320,0,0,1,0,0,0,0,0,0
4321,0,0,0,0,0,0,0,0,1
4322,0,0,1,0,0,0,0,0,0
4323,0,0,1,0,0,0,0,0,0


In [26]:
y1 = df['Approx. Total Revenue(INR)']

In [27]:
y1

0         575.68
1       21895.92
2        8655.36
3       17380.92
4       12045.15
          ...   
4320    30694.40
4321     3970.52
4322    12613.86
4323       21.12
4324    18577.35
Name: Approx. Total Revenue(INR), Length: 4325, dtype: float64

In [28]:
# Split data into a training set and a test set.  Using test_train_split
X_train,X_test,y1_train,y1_test = train_test_split(X,y1,test_size = 0.3,random_state = 0)


In [29]:
# Fit the regression to the training set of each variable.  This trains the model.
regression.fit(X_train, y1_train)

In [30]:
# Apply model to x_test data to predict y values
y1_predicted = regression.predict(X_test)

In [31]:
# Check the slope, MSE and R2 score.
rmse = mean_squared_error(y1_test, y1_predicted)
r2 = r2_score(y1_test, y1_predicted)

In [32]:
# Print the model summary statistics. This is where you evaluate the performance of the model.

print('Slope:' ,regression.coef_)
print('Mean squared error: ', rmse)
print('R2 score: ', r2)

Slope: [-1112.69095973 -1745.66342941  -459.19225883 -1794.30128773
 -2652.26637482 -1313.29338425  -726.07910461 -1182.7202613
 -1265.01966016]
Mean squared error:  187877755.8774392
R2 score:  -0.016554524081113353


## Testing with training set

In [33]:
# Predict on the training set

y1_predicted_train = regression.predict(X_train)

In [34]:
rmse = mean_squared_error(y1_train, y1_predicted_train)
r2 = r2_score(y_train, y_predicted_train)

In [35]:
print('Slope:' ,regression.coef_)
print('Mean squared error: ', rmse)
print('R2 score: ', r2)

Slope: [-1112.69095973 -1745.66342941  -459.19225883 -1794.30128773
 -2652.26637482 -1313.29338425  -726.07910461 -1182.7202613
 -1265.01966016]
Mean squared error:  224729006.6246692
R2 score:  0.0007799558596361944


From the test and training set, the model outcome is not a good fit seeing the r2 value which is comparably very less.

# Model 3: Product name and Minimum stock threshold

In [36]:
X

Unnamed: 0,Buttermilk,Cheese,Curd,Ghee,Ice Cream,Lassi,Milk,Paneer,Yogurt
0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
4320,0,0,1,0,0,0,0,0,0
4321,0,0,0,0,0,0,0,0,1
4322,0,0,1,0,0,0,0,0,0
4323,0,0,1,0,0,0,0,0,0


In [37]:
y2 = df['Minimum Stock Threshold (liters/kg)']

In [38]:
y2

0       19.55
1       43.17
2       15.10
3       74.50
4       76.02
        ...  
4320    98.07
4321    87.41
4322    33.47
4323    58.25
4324    22.34
Name: Minimum Stock Threshold (liters/kg), Length: 4325, dtype: float64

### Creating train and test set

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.4, random_state=101)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2595, 9)
(1730, 9)
(3027,)
(1298,)


In [40]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y2_train)

In [41]:
# print the intercept
print(model.intercept_)

57.10462450592885


In [42]:
coeff_parameter = pd.DataFrame(model.coef_,X.columns,columns=['Coefficient'])
coeff_parameter

Unnamed: 0,Coefficient
Buttermilk,-1.97982
Cheese,-2.522649
Curd,0.364639
Ghee,-1.831991
Ice Cream,-1.155785
Lassi,1.956372
Milk,-1.202105
Paneer,-3.704469
Yogurt,-1.362352


In [43]:
predictions = model.predict(X_test)
predictions

array([55.12480469, 53.40015564, 55.74227273, ..., 57.10462451,
       59.06099644, 55.90251938])

In [44]:
import statsmodels.api as sm
X_train_Sm= sm.add_constant(X_train)
X_train_Sm= sm.add_constant(X_train)
ls=sm.OLS(y2_train,X_train_Sm).fit()
print(ls.summary())

                                     OLS Regression Results                                    
Dep. Variable:     Minimum Stock Threshold (liters/kg)   R-squared:                       0.003
Model:                                             OLS   Adj. R-squared:                 -0.000
Method:                                  Least Squares   F-statistic:                    0.9816
Date:                                 Sun, 09 Jul 2023   Prob (F-statistic):              0.453
Time:                                         23:07:40   Log-Likelihood:                -12167.
No. Observations:                                 2595   AIC:                         2.435e+04
Df Residuals:                                     2585   BIC:                         2.441e+04
Df Model:                                            9                                         
Covariance Type:                             nonrobust                                         
                 coef    std err        

When seeing the adjusted r2 values between the variables this model is not a good fit for the analysis.

# Model 4: Farm size and the total revenue


In [45]:
X1 = df['Farm Size per sq.km']

In [46]:
X1 = pd.get_dummies(data=X1, drop_first=True)
X1.head()

Unnamed: 0,Medium,Small
0,1,0
1,0,0
2,1,0
3,0,1
4,1,0


In [47]:
y1 = df['Approx. Total Revenue(INR)']

In [48]:
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.4, random_state=101)
print(X1_train.shape)
print(X1_test.shape)
print(y1_train.shape)
print(y1_test.shape)

(2595, 2)
(1730, 2)
(2595,)
(1730,)


In [49]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X1_train,y1_train)

In [50]:
# print the intercept
print(model.intercept_)

13855.51893764434


In [51]:
coeff_parameter = pd.DataFrame(model.coef_,X1.columns,columns=['Coefficient'])
coeff_parameter

Unnamed: 0,Coefficient
Medium,-522.4284
Small,-378.855047


In [52]:
import statsmodels.api as sm
X1_train_Sm= sm.add_constant(X1_train)
X1_train_Sm= sm.add_constant(X1_train)
ls=sm.OLS(y1_train,X1_train_Sm).fit()
print(ls.summary())

                                OLS Regression Results                                
Dep. Variable:     Approx. Total Revenue(INR)   R-squared:                       0.000
Model:                                    OLS   Adj. R-squared:                 -0.001
Method:                         Least Squares   F-statistic:                    0.2976
Date:                        Sun, 09 Jul 2023   Prob (F-statistic):              0.743
Time:                                23:07:40   Log-Likelihood:                -28551.
No. Observations:                        2595   AIC:                         5.711e+04
Df Residuals:                            2592   BIC:                         5.713e+04
Df Model:                                   2                                         
Covariance Type:                    nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------

After seeing the adjusted r2 values the desired model has no fit between the variables.

# Model 5 : Storage condition and Quantity in stock

In [53]:
X3 = df['Storage Condition']

In [54]:
X3 = pd.get_dummies(data=X3, drop_first=True)
X3.head()


Unnamed: 0,Frozen,Polythene Packet,Refrigerated,Tetra Pack
0,1,0,0,0
1,0,0,0,1
2,0,0,1,0
3,1,0,0,0
4,0,0,1,0


In [55]:
y3 = df['Quantity in Stock (liters/kg)']

In [56]:
from sklearn.model_selection import train_test_split
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.4, random_state=101)
print(X3_train.shape)
print(X3_test.shape)
print(y3_train.shape)
print(y3_test.shape)

(2595, 4)
(1730, 4)
(2595,)
(1730,)


In [57]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X3_train,y3_train)

In [58]:
# print the intercept
print(model.intercept_)

236.47325102880654


In [59]:
coeff_parameter = pd.DataFrame(model.coef_,X3.columns,columns=['Coefficient'])
coeff_parameter

Unnamed: 0,Coefficient
Frozen,12.73694
Polythene Packet,24.245059
Refrigerated,20.532888
Tetra Pack,0.862956


In [60]:
predictions = model.predict(X3_test)
predictions


array([257.00613915, 257.00613915, 249.21019108, ..., 257.00613915,
       257.00613915, 260.71830986])

In [61]:
import statsmodels.api as sm
X3_train_Sm= sm.add_constant(X3_train)
X3_train_Sm= sm.add_constant(X3_train)
ls=sm.OLS(y3_train,X3_train_Sm).fit()
print(ls.summary())

                                  OLS Regression Results                                 
Dep. Variable:     Quantity in Stock (liters/kg)   R-squared:                       0.001
Model:                                       OLS   Adj. R-squared:                 -0.001
Method:                            Least Squares   F-statistic:                    0.6612
Date:                           Sun, 09 Jul 2023   Prob (F-statistic):              0.619
Time:                                   23:07:40   Log-Likelihood:                -17747.
No. Observations:                           2595   AIC:                         3.550e+04
Df Residuals:                               2590   BIC:                         3.553e+04
Df Model:                                      4                                         
Covariance Type:                       nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
---------------

Despite having positive coefficient, after analysing the adjusted r2 values it clear that the storage conditions have no effect on the quantity in stock.

# Model 6 : The Sales channel and the total revenue

In [62]:
X4 = df['Sales categories']

In [63]:
X4 = pd.get_dummies(data=X4, drop_first=True)
X4.head()

Unnamed: 0,Retail,Wholesale
0,0,1
1,0,1
2,0,0
3,0,0
4,1,0


In [64]:
y1 = df['Approx. Total Revenue(INR)']

In [65]:
y1

0         575.68
1       21895.92
2        8655.36
3       17380.92
4       12045.15
          ...   
4320    30694.40
4321     3970.52
4322    12613.86
4323       21.12
4324    18577.35
Name: Approx. Total Revenue(INR), Length: 4325, dtype: float64

In [66]:
# Split data into a training set and a test set.  Using test_train_split
X4_train,X4_test,y1_train,y1_test = train_test_split(X4,y1,test_size = 0.3,random_state = 0)


### Test set

In [67]:
# Create regression object
regression = LinearRegression()

In [68]:
# Fit the regression to the training set of each variable.  This trains the model.
regression.fit(X4_train, y1_train)

In [69]:
# Apply model to x_test data to predict y values
y1_predicted = regression.predict(X4_test)

In [70]:
# Check the slope, MSE and R2 score.
rmse = mean_squared_error(y1_test, y1_predicted)
r2 = r2_score(y1_test, y1_predicted)

In [71]:
# Print the model summary statistics. This is where you evaluate the performance of the model.

print('Slope:' ,regression.coef_)
print('Mean squared error: ', rmse)
print('R2 score: ', r2)

Slope: [ 988.18903418 1060.51078563]
Mean squared error:  187061339.64639735
R2 score:  -0.012137121875499313


### Train set

In [72]:
# Predict on the training set

y_predicted_train = regression.predict(X4_train)

In [73]:
rmse = mean_squared_error(y1_train, y1_predicted_train)
r2 = r2_score(y1_train, y1_predicted_train)

In [74]:
print('Slope:' ,regression.coef_)
print('Mean squared error: ', rmse)
print('R2 score: ', r2)

Slope: [ 988.18903418 1060.51078563]
Mean squared error:  224729006.6246692
R2 score:  0.002208203723876978


Despite the slope values showing a positive relationship the r2 value in both the train and test set is very low indicating a poor fit model.

# Model 7 : Product name and Overall prices

In [75]:
X = df['Product Name']

In [76]:
X = pd.get_dummies(data=X, drop_first=True)
X.head()

Unnamed: 0,Buttermilk,Cheese,Curd,Ghee,Ice Cream,Lassi,Milk,Paneer,Yogurt
0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0


In [77]:
y4 = df['Overall Prices']

In [78]:
# Split data into a training set and a test set.  Using test_train_split
X_train,X_test,y4_train,y4_test = train_test_split(X,y4,test_size = 0.3,random_state = 0)


### Test set

In [79]:
# Fit the regression to the training set of each variable.  This trains the model.
regression.fit(X_train, y4_train)

In [80]:
# Apply model to x_test data to predict y values
y4_predicted = regression.predict(X_test)

In [81]:
# Check the slope, MSE and R2 score.
rmse = mean_squared_error(y4_test, y4_predicted)
r2 = r2_score(y4_test, y4_predicted)

In [82]:
# Print the model summary statistics. This is where you evaluate the performance of the model.

print('Slope:' ,regression.coef_)
print('Mean squared error: ', rmse)
print('R2 score: ', r2)

Slope: [-2.75084245 -4.44754771 -3.09848596 -3.51784896 -7.78889657 -3.67672705
 -1.40150217 -4.34786387 -4.3475566 ]
Mean squared error:  689.0738670369011
R2 score:  -0.009317570764701744


### Train set

In [83]:
# Predict on the training set

y4_predicted_train = regression.predict(X_train)

In [84]:
rmse = mean_squared_error(y4_train, y4_predicted_train)
r2 = r2_score(y4_train, y4_predicted_train)

In [85]:
print('Slope:' ,regression.coef_)
print('Mean squared error: ', rmse)
print('R2 score: ', r2)

Slope: [-2.75084245 -4.44754771 -3.09848596 -3.51784896 -7.78889657 -3.67672705
 -1.40150217 -4.34786387 -4.3475566 ]
Mean squared error:  668.466772972537
R2 score:  0.005648965297671671


After seeing the values of r2 values of test and train sets it indicates a poor model fit between the variables.