In [None]:
# Copied from our Part 2 Notebook
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Load the competition's training and test tables from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/local-hack-day-data-whats-that-house-worth/train.csv')
test = pd.read_csv('/kaggle/input/local-hack-day-data-whats-that-house-worth/test.csv')

# Mean training-set sale price per state, per city and per zipcode.
# Each result is a Series named 'price' indexed by the grouping key;
# preprocess() merges them onto every row as extra features.
(
    average_home_prices_by_state,
    average_home_prices_by_city,
    average_home_prices_by_zipcode,
) = (train.groupby(key)['price'].mean() for key in ('state', 'city', 'zipcode'))

def convert_lot_area(row):
    """Return the row's lot area, converting acres to square feet.

    `row` is any mapping-like object indexable by 'lotUnit' and 'lotArea'
    (preprocess() passes one DataFrame row at a time). When 'lotUnit' is
    exactly 'acres' the area is multiplied by 43560 (square feet per acre);
    for any other unit value the area is returned unchanged -- presumably it
    is already in square feet, which the data should be checked for.
    """
    area = row['lotArea']
    return area * 43560 if row['lotUnit'] == 'acres' else area

def _add_average_price(df, average_prices, key, new_column):
    """Left-join per-`key` average prices onto `df` as `new_column`.

    `average_prices` is a Series named 'price' indexed by `key` (as produced
    by `train.groupby(key)['price'].mean()`). Any 'price' column `df` already
    has is kept under the name 'price'. Rows whose key has no average are
    filled with the mean of the new column.
    """
    merged = df.merge(average_prices, how='left', on=key)
    if 'price' in merged.columns:
        # df had no 'price' of its own, so the merged-in averages kept the
        # plain name 'price'.
        merged[new_column] = merged['price']
        merged = merged.drop('price', axis=1)
    else:
        # df already had 'price': pandas disambiguated the clash as
        # price_x (df's original column) and price_y (the averages).
        merged['price'] = merged['price_x']
        merged[new_column] = merged['price_y']
        merged = merged.drop(['price_x', 'price_y'], axis=1)
    # Keys absent from average_prices come back as NaN from the left join.
    merged[new_column] = merged[new_column].fillna(merged[new_column].mean())
    return merged


def preprocess(df):
    """Turn a raw train/test frame into a model-ready feature frame.

    Steps, in order:
      1. drop 'homeStatus', 'dateSold' and 'address';
      2. convert 'lotArea' with convert_lot_area() and drop 'lotUnit';
      3. one-hot encode 'homeType';
      4. add 'average_state_price', 'average_city_price' and
         'average_zipcode_price' from the module-level training averages;
      5. drop the raw 'state', 'city' and 'zipcode' columns.

    The input frame is not modified; a new DataFrame is returned. If `df` has
    a 'price' column (the training set) it is preserved in the result.
    """
    df = df.drop(['homeStatus', 'dateSold', 'address'], axis=1)

    df['lotArea'] = df.apply(convert_lot_area, axis=1)
    df = df.drop('lotUnit', axis=1)
    df = pd.get_dummies(df, columns=['homeType'])

    # Same join for each geographic level, coarsest to finest.
    for key, average_prices, new_column in (
        ('state', average_home_prices_by_state, 'average_state_price'),
        ('city', average_home_prices_by_city, 'average_city_price'),
        ('zipcode', average_home_prices_by_zipcode, 'average_zipcode_price'),
    ):
        df = _add_average_price(df, average_prices, key, new_column)

    return df.drop(['state', 'city', 'zipcode'], axis=1)

# Run the same preprocessing on both sets. Both use averages computed from
# `train`, so the test features are derived from training prices only.
clean_train = preprocess(train)
clean_test = preprocess(test)

In [None]:
# Display the preprocessed training frame.
clean_train

# What is a model?
In general a model is a simplified version of something that helps you understand it.

In machine learning, our models usually look like a function: for some input, our model predicts an output.

- With chatbots the input is the history of the conversation, and the output is the next word.
- With a medical diagnostic tool the input could be an X-ray, and the output could be the diagnosis.
- For our challenge, the input is everything we know about the house, and the output is how much it sells for.

One of the simplest possible models is just to take the average of all the home prices, and always guess that.

In [None]:
# Baseline "model": the mean sale price over the whole training set.
average_price = clean_train['price'].mean()

average_price

In [None]:
# Work on a copy so the extra columns do not leak into clean_train.
mean_model_df = clean_train.copy()

# Predict the training-set mean for every house. Use the computed
# `average_price` rather than a hand-copied number, so the baseline stays
# correct if the data or the preprocessing changes.
mean_model_df['predicted'] = average_price

# Per-house absolute error of the constant prediction.
mean_model_df['absolute_error'] = np.abs(mean_model_df['predicted'] - mean_model_df['price'])

mean_model_df

In [None]:
# Histogram of the constant-prediction model's absolute errors.
plt.hist(mean_model_df['absolute_error'])
# Last expression of the cell: the mean absolute error of that model.
mean_model_df['absolute_error'].mean()

# Square Footage Model

In [None]:
# Work on a copy so the columns added for this model stay out of clean_train.
square_footage_model_df = clean_train.copy()
# Average of each home's own price / livingArea ratio (a mean of per-home
# ratios, not total price divided by total area).
# NOTE(review): a livingArea of 0 would make a ratio infinite -- confirm the data has none.
average_price_per_square_foot = (square_footage_model_df['price'] / square_footage_model_df['livingArea']).mean()

average_price_per_square_foot

In [None]:
# Price each house at the average price per square foot. Use the computed
# `average_price_per_square_foot` rather than a rounded, hand-copied constant,
# so the model tracks the data.
square_footage_model_df['predicted'] = average_price_per_square_foot * square_footage_model_df['livingArea']

# Mean absolute error of the square-footage model on the training data.
np.abs(square_footage_model_df['predicted'] - square_footage_model_df['price']).mean()

# Linear Regression!

In [None]:
from sklearn.linear_model import LinearRegression

# Work on a copy so columns added for the regression stay out of clean_train.
lin_reg_df = clean_train.copy()

lin_reg_with_living_area_model = LinearRegression()

# Single-feature model: predict price from livingArea alone.
# The double brackets keep the input as a 2-D DataFrame (one column) rather
# than a 1-D Series.
input_data = lin_reg_df[['livingArea']]
output_data = lin_reg_df['price']

lin_reg_with_living_area_model.fit(input_data, output_data)

In [None]:
# Fitted coefficient(s): one value per input feature.
lin_reg_with_living_area_model.coef_

In [None]:
# Fitted intercept: the model's prediction when every feature is 0.
lin_reg_with_living_area_model.intercept_

In [None]:
# Score the fitted model on the same rows it was trained on.
lin_reg_df['predicted'] = lin_reg_with_living_area_model.predict(input_data)

# Mean absolute error, computed by hand from the residuals.
residuals = lin_reg_df['predicted'] - lin_reg_df['price']
residuals.abs().mean()

In [None]:
from sklearn.metrics import mean_absolute_error

# Same metric as the hand-rolled version, via scikit-learn.
# The documented argument order is (y_true, y_pred). MAE is symmetric, so the
# number is unchanged, but keeping the order right matters once an asymmetric
# metric is swapped in.
mean_absolute_error(lin_reg_df['price'], lin_reg_df['predicted'])

In [None]:
# Non-linear transforms of livingArea. The regression stays linear in its
# coefficients while being able to fit a curve in livingArea.
lin_reg_df['livingAreaCubed'] = lin_reg_df['livingArea'] ** 3
lin_reg_df['livingAreaSquared'] = lin_reg_df['livingArea'] ** 2
lin_reg_df['livingAreaRooted'] = lin_reg_df['livingArea'] ** 0.5

input_data = lin_reg_df[['livingArea', 'livingAreaSquared', 'livingAreaRooted', 'livingAreaCubed']]
output_data = lin_reg_df['price']

# Fit a fresh estimator instead of refitting lin_reg_with_living_area_model.
# Refitting would silently overwrite the single-feature model, so its coef_
# and intercept_ cells would show different values when re-run.
lin_reg_with_area_features_model = LinearRegression()
lin_reg_with_area_features_model.fit(input_data, output_data)

lin_reg_df['predicted'] = lin_reg_with_area_features_model.predict(input_data)

# scikit-learn's documented argument order is (y_true, y_pred).
mean_absolute_error(lin_reg_df['price'], lin_reg_df['predicted'])

In [None]:
import numpy as np
# Seed NumPy's legacy global random generator so any code drawing from
# np.random gives the same numbers on every run.
np.random.seed(0)

In [None]:
# Feature list: livingArea, its square and its square root (columns created
# in an earlier cell), plus the 3rd to 5th powers added below.
columns = ['livingArea', 'livingAreaSquared', 'livingAreaRooted']
# Start at 3: the 2nd power is already present as 'livingAreaSquared'. Adding
# it again would give the regression two identical columns, which are
# perfectly collinear and leave their coefficients undetermined.
for i in range(3, 6):
    column = f'livingAreaToThePowerOf{i}'
    columns.append(column)
    lin_reg_df[column] = lin_reg_df['livingArea'] ** i

input_data = lin_reg_df[columns]
output_data = lin_reg_df['price']

linear_regression_with_powers_model = LinearRegression()

linear_regression_with_powers_model.fit(input_data, output_data)

lin_reg_df['predicted'] = linear_regression_with_powers_model.predict(input_data)

# scikit-learn's documented argument order is (y_true, y_pred).
mean_absolute_error(lin_reg_df['price'], lin_reg_df['predicted'])

# A digression into linear models

In [None]:
# Toy data that is exactly linear: y = 2x for x = 0..19.
x = np.arange(20)
y = 2 * x

df = pd.DataFrame({'x': x, 'y': y})

plt.plot(df['x'], df['y'])

In [None]:
# Fit a straight line to the toy data and plot its predictions.
toy_features = df[['x']]
lin_reg = LinearRegression()
lin_reg.fit(toy_features, df['y'])
df['linear_predictions'] = lin_reg.predict(toy_features)
plt.plot(df['x'], df['linear_predictions'])

In [None]:
# Toy data that is NOT linear: y = x squared for x = 0..19.
x = np.arange(20)
y = x * x

df = pd.DataFrame({'x': x, 'y': y})

plt.plot(df['x'], df['y'])

In [None]:
# Fit a straight line to the quadratic data, then draw the true curve and the
# fitted line on the same axes.
toy_features = df[['x']]
lin_reg = LinearRegression()
lin_reg.fit(toy_features, df['y'])
df['nonlinear_predictions'] = lin_reg.predict(toy_features)
for series_name in ('y', 'nonlinear_predictions'):
    plt.plot(df['x'], df[series_name])

In [None]:
# Twelve hand-written points: x runs 0..11, and y follows sqrt(x) except at
# x=3 and x=9, where the values (2.6 and 2.5) are off the curve.
fake_x = list(range(12))
fake_y = [0.0, 1.0, 1.4142135623730951, 2.6, 2.0, 2.23606797749979, 2.449489742783178, 2.6457513110645907, 2.8284271247461903, 2.5, 3.1622776601683795, 3.3166247903554]
fake_data = pd.DataFrame({'x': fake_x, 'y': fake_y})

plt.scatter(fake_data['x'], fake_data['y'])

In [None]:
# Fit polynomials of increasing degree (1..39) to fake_data. Each pass adds
# one more power of x and stores that model's predictions as 'predictedFrom{i}'.

# Raise x to powers as floats. fake_data['x'] is an integer column, and
# integer powers wrap around silently once they exceed the int64 range
# (for example 11**19), which would feed garbage features to the
# higher-degree fits.
x_as_float = fake_data['x'].astype(float)

# 'xToThePowerOf1' already is x, so the list starts empty rather than with a
# duplicate 'x' column.
columns = []
predicted_columns = []
for i in range(1, 40):
    column = f'xToThePowerOf{i}'
    columns.append(column)
    fake_data[column] = x_as_float ** i
    model = LinearRegression()
    model.fit(fake_data[columns], fake_data['y'])
    predicted_column = f'predictedFrom{i}'
    predicted_columns.append(predicted_column)
    fake_data[predicted_column] = model.predict(fake_data[columns])

In [None]:
# One figure per selected polynomial degree: the raw points plus that model's
# fitted values.
for predicted_column in ['predictedFrom1', 'predictedFrom2', 'predictedFrom5', 'predictedFrom9', 'predictedFrom39']:
    plt.title(predicted_column)
    plt.scatter(fake_data['x'], fake_data['y'])
    # Plot against x explicitly. Plotting the Series alone uses the row index
    # as the horizontal axis, which only lines up because the index happens
    # to equal x here.
    plt.plot(fake_data['x'], fake_data[predicted_column])
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold-out evaluation of the single-feature (livingArea) model.
X = clean_train[['livingArea']]
y = clean_train['price']

errors = []

# Repeat an 80/20 split with four different seeds, so the estimate does not
# hinge on one lucky or unlucky split.
for i in range(4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=i)
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # scikit-learn's documented argument order is (y_true, y_pred).
    errors.append(mean_absolute_error(y_test, predictions))

print(errors)
print(f'average error: {np.mean(errors)}')

In [None]:
# Feature matrix: every preprocessed column except the target 'price'.
X = clean_train.drop(['price'], axis=1)
X

In [None]:
# All-features model, scored on the rows it was trained on.
X = clean_train.drop(['price'], axis=1)
y = clean_train['price']

errors = []

for i in range(4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=i)
    model = LinearRegression()
    model.fit(X_train, y_train)
    # NOTE(review): this scores on the TRAINING split, not the held-out one --
    # presumably on purpose, to compare against held-out error. Confirm.
    predictions = model.predict(X_train)
    # scikit-learn's documented argument order is (y_true, y_pred).
    errors.append(mean_absolute_error(y_train, predictions))

print(errors)
print(f'average error: {np.mean(errors)}')

In [None]:
# All-features model, scored on the held-out 20% of each split.
X = clean_train.drop(['price'], axis=1)
y = clean_train['price']

errors = []

for i in range(4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=i)
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # scikit-learn's documented argument order is (y_true, y_pred).
    errors.append(mean_absolute_error(y_test, predictions))

print(errors)
print(f'average error: {np.mean(errors)}')

In [None]:
# Train the submission model on ALL training rows (no hold-out).
final_model = LinearRegression()
final_model.fit(X, y)

In [None]:
submission = clean_test[['id']].copy()

# Align the test features to the exact columns (and order) the model was
# trained on. preprocess() one-hot encodes 'homeType' separately for train and
# test, so a home type present in only one of them would give mismatched
# columns and make predict() fail. Missing dummy columns are filled with 0
# ("not this type"); columns the model never saw are dropped.
# NOTE(review): X contains every non-price column of clean_train, so if the
# training data has an 'id' column it is being used as a feature -- confirm
# that is intended.
test_features = clean_test.reindex(columns=X.columns, fill_value=0)
submission['price'] = final_model.predict(test_features)

In [None]:
# Write the predictions to disk, then end the cell with the frame. A notebook
# only displays the LAST expression of a cell, so a bare `submission` placed
# before to_csv() was never shown.
submission.to_csv('submission.csv', index=False)
submission

In [None]:
# The five most frequent cities in the training set.
train['city'].value_counts().head(5)


In [None]:
# The five most frequent cities in the test set.
test['city'].value_counts().head(5)