In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.svm import LinearSVR
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import numpy as np
from joblib import dump


In [2]:
df = pd.read_csv("../data/location3.csv").drop(columns =["id","types"], axis=1)

df

types_cols = [col for col in df.columns if col.startswith("type")]

df[types_cols] = df[types_cols].apply(lambda s: s.str.lstrip())

unique_values = set()

# for col in types_cols:
#     unique_values = unique_values.union(set(df["types1"].values))

for col in types_cols:
    unique_values = unique_values.union(set(df[col].values))
    
df["types"] = df[types_cols].apply(lambda row: set(row), axis=1)

boolean_features = {}

for category in unique_values:
    boolean_features[category] = df["types"].apply(lambda s: category in s)
    
boolean_df = pd.DataFrame(boolean_features)

boolean_df

df = pd.concat([df, boolean_df], axis=1).drop([*types_cols, "types"], axis=1)
df.columns

Index([                   'name',                  'rating',
            'user_ratings_total',                'latitude',
                     'longitude',         'latitude_region',
                             nan,           'shopping_mall',
                'clothing_store',         'natural_feature',
                 'travel_agency',                    'food',
             'point_of_interest',                   'store',
                 'meal_delivery',                    'park',
                  'liquor_store',             'gas_station',
                    'restaurant',                    'cafe',
                 'meal_takeaway',           'movie_theater',
        'grocery_or_supermarket',              'night_club',
                     'hair_care',                 'finance',
                           'bar',             'art_gallery',
            'tourist_attraction',                'cemetery',
                   'supermarket',      'real_estate_agency',
                        

In [3]:
df

Unnamed: 0,name,rating,user_ratings_total,latitude,longitude,latitude_region,NaN,shopping_mall,clothing_store,natural_feature,...,jewelry_store,place_of_worship,church,local_government_office,home_goods_store,museum,car_wash,car_repair,aquarium,establishment
0,Mikado Ryotei,4.5,369.0,30.373440,-97.724686,North,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,Earl J. Pomerleau Pocket Park,4.3,6.0,30.318483,-97.686791,North,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,La Placita,4.4,241.0,30.194589,-97.745189,Central,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,Dittmar District Park,4.4,297.0,30.182571,-97.799721,Central,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,Rosa's Café & Tortilla Factory,4.2,1305.0,30.171329,-97.799098,Central,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,Heritage Oaks Neighborhood Park,4.7,43.0,30.234326,-97.737227,North,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
383,Centre North,3.6,16.0,30.351838,-97.709802,North,True,True,False,False,...,False,False,False,False,False,False,False,False,False,True
384,Oak Acres,4.8,5.0,30.237034,-97.846914,North,True,True,False,False,...,False,False,False,False,False,False,False,False,False,True
385,Burgerlicious,5.0,18.0,30.272704,-97.728530,North,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [4]:
df.iloc[1]

name                       Earl J. Pomerleau Pocket Park
rating                                               4.3
user_ratings_total                                     6
latitude                                         30.3185
longitude                                       -97.6868
latitude_region                                    North
NaN                                                 True
shopping_mall                                      False
clothing_store                                     False
natural_feature                                    False
travel_agency                                      False
food                                               False
point_of_interest                                   True
store                                              False
meal_delivery                                      False
park                                                True
liquor_store                                       False
gas_station                    

In [5]:
df = df.set_index("name")

In [6]:
target = "rating"

In [7]:
X = df.drop(
     columns=[
        "rating",
        "latitude_region",
        "user_ratings_total",
    ],
    axis=1,
).values
y = df[target].values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)

In [9]:
linear_regression = LinearRegression()

In [10]:
linear_regression.fit(X_train, y_train)

LinearRegression()

In [11]:
y_test_pred = linear_regression.predict(X_test)

In [12]:
mean_squared_error(y_test, y_test_pred)

0.35881931782726856

In [13]:
decision_tree = DecisionTreeRegressor()
decision_tree.fit(X_train, y_train)
y_test_pred = decision_tree.predict(X_test)
mean_squared_error(y_test, y_test_pred)

0.5539175257731959

In [14]:
random_forest = RandomForestRegressor()
random_forest.fit(X_train, y_train)
y_test_pred = random_forest.predict(X_test)
mean_squared_error(y_test, y_test_pred)

0.3743284432989688

In [15]:
svr = LinearSVR()
svr.fit(X_train, y_train)
y_test_pred = svr.predict(X_test)
mean_squared_error(y_test, y_test_pred)



0.34465517976986537

In [16]:
la_placita = (
    df.drop(
        columns=[
            "rating",
            "latitude_region",
            "user_ratings_total",
        ],
        axis=1,
    )
    .loc["La Placita"]
    .values
)
random_forest.predict([la_placita])

array([4.378])

In [17]:
linear_regression.predict([la_placita])

array([4.58177474])

In [18]:
lr_pipeline = make_pipeline(LinearRegression())
decision_tree_pipeline = make_pipeline(DecisionTreeRegressor())
random_forest_pipeline = make_pipeline(RandomForestRegressor())

In [19]:
k_fold = KFold(n_splits=6, shuffle=True, random_state=45)

In [20]:
cross_validate(lr_pipeline, X, y, scoring="neg_mean_squared_error", cv=k_fold)[
    "test_score"
].mean()

-0.3372398723594841

In [21]:
cross_validate(
    decision_tree_pipeline, X, y, scoring="neg_mean_squared_error", cv=k_fold
)["test_score"].mean()

-0.4418601762820513

In [22]:
cross_validate(
    random_forest_pipeline, X, y, scoring="neg_mean_squared_error", cv=k_fold
)["test_score"].mean()

-0.35660218617788475

In [24]:
dump(lr_pipeline, "lr.joblib")

['lr.joblib']