# Import the dataset

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('50_Startups.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


# Handle with missing data

In [2]:
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

# Change the data into numerical

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 1.8+ KB


In [4]:
from sklearn import preprocessing
label = preprocessing.LabelEncoder()

df['State'] = label.fit_transform(df['State'])

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     int32  
 4   Profit           50 non-null     float64
dtypes: float64(4), int32(1)
memory usage: 1.8 KB


# Train test split for logistic regression

In [6]:
x1 = df.drop('State', axis=1)
y1 = df['State']

In [7]:
from sklearn.model_selection import train_test_split

x1_train,x1_test,y1_train,y1_test = train_test_split(x1, y1, test_size=0.30)

# Logistic regression

In [8]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(x1_train,y1_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix,accuracy_score

predictions = model.predict(x1_test)
print(classification_report(y1_test, predictions))
print(confusion_matrix(y1_test, predictions))
print(accuracy_score(y1_test, predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.33      0.17      0.22         6
           2       0.18      0.67      0.29         3

    accuracy                           0.20        15
   macro avg       0.17      0.28      0.17        15
weighted avg       0.17      0.20      0.15        15

[[0 1 5]
 [1 1 4]
 [0 1 2]]
0.2


# Train test split for linear regression

In [10]:
x2 = df.drop('Profit', axis=1)
y2 = df['Profit']

x2_train,x2_test,y2_train,y2_test = train_test_split(x2, y2, test_size=0.30)

# Linear regession

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import math

model1 = LinearRegression()
model1.fit(x2_train,y2_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
predictions1 = model1.predict(x2_test)

print(r2_score(y2_test,predictions1))
mse = mean_squared_error(y2_test,predictions1)
rmse = math.sqrt(mse)
print(mse)
print(rmse)

0.9305026183259096
41006303.23479949
6403.616418462265
