In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Data Gathering

In [2]:
# Load the iris measurements; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv("iris.csv")
# Bare last expression -> rich display of the frame (pandas truncates long frames).
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [5]:
# Number of rows per species label, returned as a plain dict.
df["Species"].value_counts().to_dict()

{'Iris-setosa': 50, 'Iris-versicolor': 50, 'Iris-virginica': 50}

In [6]:
# Encode the species label as an integer code so it can be used as a numeric
# feature. Only the Species column is touched (a frame-wide replace would also
# rewrite matching values in any other column), and the dtype guard makes the
# cell safe to re-run once the column has already been encoded.
# NOTE(review): a 0/1/2 code imposes an ordering on a nominal label when fed
# to a linear model; one-hot encoding may be preferable.
species_codes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
if not pd.api.types.is_integer_dtype(df["Species"]):
    # map() yields NaN for any label missing from species_codes; the int64
    # cast then raises instead of silently leaving an unencoded value behind.
    df["Species"] = df["Species"].map(species_codes).astype("int64")

In [7]:
# Display the frame to confirm Species now holds integer codes.
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,0
1,2,4.9,3.0,1.4,0.2,0
2,3,4.7,3.2,1.3,0.2,0
3,4,4.6,3.1,1.5,0.2,0
4,5,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,2
146,147,6.3,2.5,5.0,1.9,2
147,148,6.5,3.0,5.2,2.0,2
148,149,6.2,3.4,5.4,2.3,2


In [8]:
# Re-check dtypes: Species should now be an integer column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 7.2 KB


In [9]:
# Id just numbers the rows (1, 2, 3, ...), so it is removed before modelling.
# errors="ignore" keeps the cell re-runnable: the in-place drop raised KeyError
# on a second execution because the column was already gone.
df = df.drop(columns=["Id"], errors="ignore")

In [10]:
# Display the frame to confirm the Id column is gone.
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [11]:
# Regress sepal length on every remaining column:
# x holds the features, y the target.
x = df.drop(columns=['SepalLengthCm'])
y = df['SepalLengthCm']

In [12]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=10,
)

In [13]:
# (rows, columns) of the training features.
x_train.shape

(105, 4)

In [14]:
# (rows, columns) of the held-out test features.
x_test.shape

(45, 4)

In [15]:
# Linear regression with scikit-learn's default settings.
linear_reg_model = LinearRegression()

# Train on the training split only; the test split stays unseen until evaluation.
linear_reg_model.fit(X=x_train, y=y_train)

In [16]:
# Fitted intercept term of the linear model.
linear_reg_model.intercept_

1.6057414969300412

In [17]:
# Fitted coefficients, one per column of x_train, in column order.
linear_reg_model.coef_

array([ 0.67317937,  0.82348314, -0.46451517, -0.35518388])

In [19]:
from sklearn.metrics import mean_squared_error, r2_score

In [20]:
# Predict sepal length for the held-out rows.
y_pred = linear_reg_model.predict(X=x_test)

# Mean squared error between the true and predicted test targets.
MSE = mean_squared_error(y_true=y_test, y_pred=y_pred)
MSE

0.10740833670732566

In [21]:
# Coefficient of determination (R^2) on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

0.8506886237163902

In [22]:
# The estimator's own score() on the test split; it yields the same value as
# the r2_score computed from y_pred.
linear_reg_model.score(x_test,y_test)

0.8506886237163902

In [23]:
# Recap of the modelling frame's columns, which are the inputs the prediction
# helper has to collect.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  150 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
 4   Species        150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [27]:
def user_input():
    """Prompt for the four model features and print the predicted sepal length.

    Reads SepalWidthCm, PetalLengthCm and PetalWidthCm as floats and Species
    as an integer code from standard input, feeds them to the module-level
    ``linear_reg_model`` and prints the prediction it returns.

    Raises:
        ValueError: if an entry cannot be parsed as a number, or if Species is
            not one of the integer codes 0, 1 or 2.
    """
    SepalWidthCm = float(input("SepalWidthCm"))
    PetalLengthCm = float(input("PetalLengthCm"))
    PetalWidthCm = float(input("PetalWidthCm"))
    Species = int(input("Species"))

    # The model only ever saw species codes 0, 1 and 2; any other value would
    # silently extrapolate along a meaningless axis.
    if Species not in (0, 1, 2):
        raise ValueError(f"Species must be 0, 1 or 2, got {Species}")

    # Build a one-row frame keyed by column name, in the same order as the
    # training features. The model was fitted on a DataFrame, so passing a
    # bare array makes scikit-learn (>= 1.0) warn that X has no valid feature
    # names, and leaves the column order implicit.
    user_data = pd.DataFrame(
        [
            {
                "SepalWidthCm": SepalWidthCm,
                "PetalLengthCm": PetalLengthCm,
                "PetalWidthCm": PetalWidthCm,
                "Species": Species,
            }
        ]
    )

    # predict() returns a length-1 array for the single input row.
    SepalLengthCm = linear_reg_model.predict(user_data)
    print("SepalLengthCm == ",SepalLengthCm)
user_input()

SepalWidthCm 3.5
PetalLengthCm 1.4
PetalWidthCm 0.2
Species 0


SepalLengthCm ==  [5.02184266]




In [28]:
import pickle

In [29]:
# Persist the fitted estimator so it can be reloaded later without retraining.
# Only unpickle this file from a trusted location: pickle.load can execute
# arbitrary code.
with open('linear_reg_model.pkl', mode='wb') as model_file:
    pickle.dump(linear_reg_model, model_file)