In [124]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,accuracy_score,mean_squared_log_error,r2_score

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, LinearRegression

In [125]:
train_df = pd.read_csv("C:\\Users\\Dashang\\Downloads\\Github\\Stayze_Rent_Predicition\\Stayze_Rent_Predicition_Hackathon\\Dashang\\data\\Train.csv")

In [126]:
test_df = pd.read_csv("C:\\Users\\Dashang\\Downloads\\Github\\Stayze_Rent_Predicition\\Stayze_Rent_Predicition_Hackathon\\Dashang\\data\\Test.csv")

In [109]:
train_df.isna().sum()

id                                   0
name                                10
host_id                              0
host_name                            8
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       4931
reviews_per_month                 4931
calculated_host_listings_count       0
availability_365                     0
dtype: int64

In [110]:
train_df.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [111]:
def Preprocessing(df):
    df['last_review']= pd.to_datetime(df['last_review'],infer_datetime_format=True) 
    df_after_host_name_drop = df.drop(['host_name','name'], axis=1)
    df_after_host_name_drop['reviews_per_month'] = df_after_host_name_drop['reviews_per_month'].fillna(0)
    
    minimum = min(df_after_host_name_drop['last_review'])
    df_after_host_name_drop['last_review'] = df_after_host_name_drop['last_review'].fillna(minimum)
    df_after_host_name_drop['last_review'] = df_after_host_name_drop['last_review'].apply(lambda x: x.toordinal() - minimum.toordinal())
    
    #target Price is skewed
    if 'price' in df.columns:
        df_after_host_name_drop['price']=np.log1p(df_after_host_name_drop["price"])
    
    df_drop_id_host_id =  df_after_host_name_drop.drop(['host_id', 'id'], axis=1)
    df_drop_id_host_id['minimum_nights'] = np.log1p(df_drop_id_host_id['minimum_nights'])
    df_drop_id_host_id['reviews_per_month'] = df_drop_id_host_id[df_drop_id_host_id['reviews_per_month'] < 17.5]['reviews_per_month']
    
    # df_drop_id_host_id['all_year_avail'] = df_drop_id_host_id['availability_365']>353
    # df_drop_id_host_id['low_avail'] = df_drop_id_host_id['availability_365']< 12
    # df_drop_id_host_id['no_reviews'] = df_drop_id_host_id['reviews_per_month']==0
    
    cat_features = df_drop_id_host_id.select_dtypes(include=['object'])
    one_hot = pd.get_dummies(cat_features)
    one_hot.head()
    
    df_drop_id_host_id['reviews_per_month'] = df_drop_id_host_id['reviews_per_month'].fillna(0)
    
    num_feat =  df_drop_id_host_id.select_dtypes(exclude=['object'])
    #y = num_feat.price
    #num_feat = num_feat.drop(['price'], axis=1)
    
    X = np.concatenate((num_feat, one_hot), axis=1)
    processed_df = pd.concat([num_feat, one_hot], axis=1)

    return processed_df

In [112]:
proc_df= Preprocessing(train_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [113]:
proc_df.dtypes

latitude                     float64
longitude                    float64
price                        float64
minimum_nights               float64
number_of_reviews              int64
                              ...   
neighbourhood_Woodlawn         uint8
neighbourhood_Woodside         uint8
room_type_Entire home/apt      uint8
room_type_Private room         uint8
room_type_Shared room          uint8
Length: 231, dtype: object

In [114]:
X=proc_df.drop(['price'],axis=1)

In [115]:
X_columns_list = X.columns

In [116]:
X.shape

(23958, 230)

In [117]:
y=proc_df.price

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40)

In [119]:
n_folds = 5

# squared_loss
def rmse_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state = 91).get_n_splits(numerical_features)
    return cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=kf)

def rmse_lv_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state = 91).get_n_splits(numerical_features)
    return cross_val_score(model, Xlv_train, y_train, scoring='neg_mean_squared_error', cv=kf)

In [120]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
y_pred_train = lr.predict(X_train)
print('Train RMSE:',np.sqrt(mean_squared_error(y_train, lr.predict(X_train))))
print('Test RMSE:',np.sqrt(mean_squared_error(y_test, lr.predict(X_test))))

Train RMSE: 0.46029226866861234
Test RMSE: 378043.4234386376


In [121]:
X.shape

(23958, 230)

In [122]:
X_train.shape

(16770, 230)

## Test DF

In [123]:
test_df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [88]:
proc_df.columns

Index(['latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',
       'last_review', 'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'neighbourhood_group_Bronx',
       ...
       'neighbourhood_Whitestone', 'neighbourhood_Williamsbridge',
       'neighbourhood_Williamsburg', 'neighbourhood_Windsor Terrace',
       'neighbourhood_Woodhaven', 'neighbourhood_Woodlawn',
       'neighbourhood_Woodside', 'room_type_Entire home/apt',
       'room_type_Private room', 'room_type_Shared room'],
      dtype='object', length=234)

In [81]:
test_id_col = test_df['id']

In [82]:
proc_test_df= Preprocessing(test_df)

In [83]:
proc_test_df.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,...,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,40.66751,-73.95867,1.098612,121,2811,1.7,1,39,0,1,...,0,0,0,0,0,0,0,1,0,0
1,40.75655,-73.9969,0.693147,18,2785,2.49,30,364,0,0,...,0,0,0,0,0,0,0,0,1,0
2,40.69252,-73.99121,0.693147,87,2827,4.29,1,108,0,1,...,0,0,0,0,0,0,0,0,1,0
3,40.77292,-73.90101,2.302585,0,0,0.0,2,365,0,0,...,0,0,0,0,0,0,0,0,1,0
4,40.71863,-73.9498,0.693147,28,2837,14.0,2,20,0,1,...,1,0,0,0,0,0,0,0,1,0


In [86]:
proc_df.shape

(34226, 234)

In [87]:
proc_test_df.shape

(14669, 223)

In [103]:
train_df.neighbourhood.unique().size, train_df.neighbourhood_group.unique().size

(217, 5)

In [104]:
test_df.neighbourhood.unique().size , test_df.neighbourhood_group.unique().size

(207, 5)

In [85]:
lr.predict(proc_test_df)

ValueError: shapes (14669,223) and (233,) not aligned: 223 (dim 1) != 233 (dim 0)