<a href="https://colab.research.google.com/github/ebrarkiziloglu/My-Basic-ML-Models/blob/main/People-height-and-weight/people_random_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Path of the CSV holding the first dataset (relative to the working directory).
random_data_file_path = 'people_1000_records_age&heights&weight.csv'

# Read the file and, in the same step, discard every row that contains at least
# one missing value (axis=0 -> rows are removed, not columns).
random_data = pd.read_csv(random_data_file_path).dropna(axis=0)

# Show the column labels of the cleaned frame.
random_data.columns

Index(['id', 'name', 'age', 'height', 'weight'], dtype='object')

In [7]:
# Columns used as model inputs.
features = ['age', 'height']

# Feature matrix (X) and prediction target (y: the weight column).
X = random_data[features]
y = random_data['weight']

# Summary statistics of the input columns.
X.describe()

Unnamed: 0,age,height
count,1000.0,1000.0
mean,54.889,180.448
std,25.693439,17.049281
min,11.0,151.0
25%,33.75,166.0
50%,54.0,180.0
75%,78.0,195.0
max,100.0,210.0


In [3]:
# Peek at the first five rows of the cleaned frame (5 is also head()'s default).
random_data.head(5)

Unnamed: 0,id,name,age,height,weight
0,1,name_1,61,155.0,158.0
1,2,name_2,14,166.0,74.0
2,3,name_3,24,162.0,131.0
3,4,name_4,94,159.0,212.0
4,5,name_5,39,194.0,161.0


In [4]:
from sklearn.tree import DecisionTreeRegressor

# Create the decision-tree regressor with a pinned random_state (so every run
# yields the same results) and train it on the complete feature matrix / target.
# fit() returns the estimator itself, so the call can be chained.
random_model = DecisionTreeRegressor(random_state=1).fit(X, y)

# Quick sanity check: predict the weight of the first five people in X.
print("Making predictions for the following 5 people:")
print(X.head())
print("The predictions are")
print(random_model.predict(X.head()))

from sklearn.metrics import mean_absolute_error

# Predict on the very same rows the model was fitted on.
predicted_weight = random_model.predict(X)

# In-sample error: expected to be very small, because the data used for scoring
# is exactly the data the model was trained on.
mean_absolute_error(y_true=y, y_pred=predicted_weight)

Making predictions for the following 5 people:
   age  height
0   61   155.0
1   14   166.0
2   24   162.0
3   94   159.0
4   39   194.0
The predictions are
[158.  74. 131. 212. 161.]


0.9403333333333334

In [5]:
# Now, let's break up the data into two pieces, using the function train_test_split of scikit-learn library: 
# We'll use some of that data as training data to fit the model
# Then, we'll use the other data as validation data to calculate mean_absolute_error:

from sklearn.model_selection import train_test_split

# Split the data into a training part and a validation part.
# random_state=0 makes the partition identical on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

# Define the model. random_state is pinned (same value as random_model) so the
# fitted tree, and therefore the reported error, is reproducible across runs:
better_model = DecisionTreeRegressor(random_state=1)

# Fit the model on the training part only:
better_model.fit(train_X, train_y)

# Get predicted weights for the validation data:
val_predictions = better_model.predict(val_X)

# This mean error is expected to be bigger than the in-sample one, since the
# model is now scored on rows it was not fitted on:
print(mean_absolute_error(val_y, val_predictions))

28.15


Let's now repeat the same calculations on the more logically consistent data:

In [6]:
# Path of the CSV holding the second, "more logical" dataset.
more_logical_data_file_path = 'people_3000_records_more_logical_data.csv'

# Read the file and immediately drop every row with at least one missing value
# (axis=0 -> rows are removed, not columns).
logical_data = pd.read_csv(more_logical_data_file_path).dropna(axis=0)

# Show the column labels of the cleaned frame.
logical_data.columns

Index(['id', 'name', 'age', 'height', 'weight'], dtype='object')

In [10]:
# Columns used as model inputs (same two features as for the first dataset).
features = ['age', 'height']

# Feature matrix and prediction target (the weight column) for the second dataset.
logical_X = logical_data[features]
logical_y = logical_data['weight']

# Summary statistics of the input columns.
logical_X.describe()

Unnamed: 0,age,height
count,3000.0,3000.0
mean,25.593667,159.376333
std,11.702259,33.781017
min,2.0,51.0
25%,15.75,153.0
50%,30.5,167.0
75%,35.0,181.0
max,40.0,200.0


In [13]:
# Create the decision-tree regressor with a pinned random_state (so every run
# yields the same results) and train it on the whole second dataset.
# fit() returns the estimator itself, so the call can be chained.
logical_model = DecisionTreeRegressor(random_state=1).fit(logical_X, logical_y)

# Quick sanity check: predict the weight of the first five people in logical_X.
print("Making predictions for the following 5 people:")
print(logical_X.head())
print("The predictions are")
print(logical_model.predict(logical_X.head()))

# In-sample error: the model is scored on exactly the rows it was fitted on,
# so this value is expected to be small.
predicted_weight = logical_model.predict(logical_X)
mean_absolute_error(y_true=logical_y, y_pred=predicted_weight)

Making predictions for the following 5 people:
   age  height
0    2    51.0
1    8   118.0
2    9    73.0
3    3   110.0
4    5   133.0
The predictions are
[38.  26.  32.  28.  19.5]


4.657599591149592

In [14]:
# Now, we'll again split the data into two parts (training and validation).
# random_state=0 makes the partition identical on every run.

train_X_logical, val_X_logical, train_y_logical, val_y_logical = train_test_split(logical_X, logical_y, random_state = 0)

# Define the model. random_state is pinned (same value as logical_model) so the
# fitted tree, and therefore the reported error, is reproducible across runs:
better_logical_model = DecisionTreeRegressor(random_state=1)

# Fit the model on the training part only:
better_logical_model.fit(train_X_logical, train_y_logical)

# Get predicted weights for the validation data:
val_predict = better_logical_model.predict(val_X_logical)

# This time the error should be smaller than for the first dataset, because the data is more reliable:
print(mean_absolute_error(val_y_logical, val_predict))

10.233969841269841
