In [130]:
import pandas as pd
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression

## Testing different models on California Housing data in attempt to achieve 0.8 score on the testing data set

In [131]:
housing_df = pd.read_csv('california-housing(1).csv')
housing_df = housing_df.dropna(axis = "index")
housing_df = housing_df.dropna(axis = "columns")
housing_df = housing_df.drop(columns = ['ocean_proximity'])

In [132]:
y = housing_df['median_house_value']
x = housing_df.drop(columns=['median_house_value'])

In [133]:
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7)

Attempt #1 - Linear Model

In [134]:
lm = LinearRegression()

In [135]:
lm.fit(x_train, y_train)

LinearRegression()

In [136]:
lm.score(x_test, y_test)

0.6487027597861967

Attempt #2 - KNeighborsRegression - 3 neighbors

In [137]:
three_neighbors = KNeighborsRegressor(n_neighbors=3)

In [138]:
three_neighbors.fit(x_train, y_train)

KNeighborsRegressor(n_neighbors=3)

In [139]:
three_neighbors.score(x_test, y_test)

0.16767602975236828

Attempt #3 - KNeighbors Regression - 5 neighbors

In [140]:
five_neighbors = KNeighborsRegressor(n_neighbors=5)

In [141]:
five_neighbors.fit(x_train, y_train)

KNeighborsRegressor()

In [142]:
five_neighbors.score(x_test, y_test)

0.24014074930244567

Attempt #4 - 5 Feature KBest w/ mutual_info_regression/LinearRegression Pipeline

In [143]:
five_feature_selector = SelectKBest(score_func=mutual_info_regression, k=5)
lm5 = LinearRegression()
lm5_pipeline = make_pipeline(five_feature_selector, lm5)

In [144]:
lm5_pipeline.fit(x_train, y_train)

Pipeline(steps=[('selectkbest',
                 SelectKBest(k=5,
                             score_func=<function mutual_info_regression at 0x000001B2804FEB80>)),
                ('linearregression', LinearRegression())])

In [145]:
lm5_pipeline.score(x_test, y_test)

0.6085936227964088

Attempt #5 - 3 Feature KBest w/ f_regression/Linear Regression Pipeline

In [146]:
three_feature_selector = SelectKBest(score_func=f_regression, k=3)
lm3 = LinearRegression()
lm3_pipeline = make_pipeline(three_feature_selector, lm3)

In [147]:
lm3_pipeline.fit(x_train, y_train)

Pipeline(steps=[('selectkbest',
                 SelectKBest(k=3,
                             score_func=<function f_regression at 0x000001B2FF3C8F70>)),
                ('linearregression', LinearRegression())])

In [148]:
lm3_pipeline.score(x_test, y_test)

0.4911317669081505

It appears that Linear Regression by itself models this data the best with 0.63 as a score on the testing data

Based on the graph and data, this makes sense as the graph is somewhat in a straight line. X and Y correlate with each other as data increases/decreases