In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from matplotlib import pyplot as plt

# The following lines adjust the granularity of reporting.
pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format
# tf.keras.backend.set_floatx('float32')

In [None]:
# Load the training table from the Kaggle input mount.
train = pd.read_csv('/kaggle/input/Train.csv')
# Remove spaces from the column names; later cells select columns by exact
# name and by name length.
train.columns = train.columns.str.replace(' ', '')
# Independent copy of four columns, used by the histogram/correlation cells.
corr_features = train[['X', 'Y', 'target_2015', 'elevation']].copy()
train.describe()

In [None]:
hist = corr_features.hist(figsize=(20, 20))

In [None]:
corr_features.corr()

In [None]:
train.head()

In [None]:
f, ax = plt.subplots(figsize=(10,6))
corr= corr_features.corr()
heatmap = sns.heatmap(round(corr, 2), annot=True, ax=ax, cmap='coolwarm',fmt='.2f',linewidths=.05)

In [None]:
# cross 'X' and 'Y'
train['XY'] = train['X']*train['Y']
train["XY_elevation"] = train['XY'] * train['elevation']
train.describe()

In [None]:
# Scatter Plot
plt.scatter(train['XY'], train['target_2015'], alpha=0.4, edgecolors='w')

In [None]:
# Pick out the 27-character columns and bucket them by the character at
# index 9 ("5" -> 2015, "9" -> 2019); everything else is ignored.
# NOTE(review): presumably these are the weekly precipitation columns and
# index 9 is the last digit of the year -- verify against the CSV header.
precip_candidates = [name for name in train.columns if len(name) == 27]
cols_2015 = [name for name in precip_candidates if name[9] == "5"]
cols_2019 = [name for name in precip_candidates if name[9] == "9"]

train['total_precip_2015'] = 0
train['total_precip_2019'] = 0

# Accumulate column by column, in file order, into the running totals.
for col in cols_2015:
    train['total_precip_2015'] += train[col]
for col in cols_2019:
    train['total_precip_2019'] += train[col]

count_2015 = len(cols_2015)
count_2019 = len(cols_2019)

# Per-column averages derived from the totals.
train["ave_precip_2015"] = train['total_precip_2015'] / count_2015
train["ave_precip_2019"] = train['total_precip_2019'] / count_2019
train.describe()

In [None]:
# The totals and averages have been computed, so every 27-character
# column can be removed in a single drop.
weekly_precip_cols = [name for name in train.columns if len(name) == 27]
train = train.drop(columns=weekly_precip_cols)
test = train.copy()

In [None]:
train.describe()

In [None]:
test.describe()

In [None]:
# Correlation heat map of the engineered features and the 2015 target.
cols = ['X', 'Y', 'XY', 'elevation', 'LC_Type1_mode', 'total_precip_2015', 'ave_precip_2015', 'target_2015']
f, ax = plt.subplots(figsize=(15,10))
# Plot the pairwise correlations of the selected columns (a small square
# matrix) rather than the raw rows of the frame.
corr = train[cols].corr()
heatmap = sns.heatmap(round(corr, 2), annot=True, ax=ax, cmap='coolwarm', fmt='.2f', linewidths=.05)

In [None]:
# Imported for its side effect of making the '3d' projection available.
from mpl_toolkits import mplot3d

# Static 3-D scatter: X and Y on the horizontal axes, target_2015 as height.
fig = plt.figure(figsize=(20, 15))
ax = fig.add_subplot(111, projection='3d')

xs = train['X']
ys = train['Y']
zs = train['target_2015']
ax.scatter(xs, ys, zs, s=50, alpha=0.6, edgecolors='w')

# NOTE(review): X is labelled latitude and Y longitude here -- confirm this
# matches the dataset's coordinate convention.
ax.set_xlabel('latitude')
ax.set_ylabel('longitude')
ax.set_zlabel('target_2015')

plt.show()

In [None]:
# Enlarge the current figure before drawing.
fig = plt.gcf()
fig.set_size_inches(15, 15)

# Bubble plot: position is (X, Y) and marker area grows with target_2015.
plt.scatter(x = train['X'], 
            y = train['Y'], 
            s = train['target_2015']*1000, # marker area scales with the target value
            alpha=0.4, 
            edgecolors='w')

# NOTE(review): the axis labels say latitude/longitude for X/Y while the title
# lists Longitude first -- confirm which coordinate X actually holds.
plt.xlabel('latitude')
plt.ylabel('longitude')
plt.title('Longitude - Latitude - Target_2015', y=1.05)

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Interactive 3-D scatter: position is (X, Y, elevation), colour is
# target_2015. Only the plotly figure is created here; no matplotlib figure
# is opened, so the cell renders a single plot.
fig = px.scatter_3d(train, x='X', y='Y', z='elevation', color='target_2015', size_max=2)
fig.update_layout(title='Elevation', autosize=True,
                  width=500, height=500,
                  margin=dict(l=65, r=50, b=65, t=90))
fig.show()

In [None]:
test = train.copy()

# Among the 27-character columns, those whose character at index 9 is "5"
# are removed from `train` and those with "9" are removed from `test`.
candidates = [name for name in train.columns if len(name) == 27]
drop_from_train = [name for name in candidates if name[9] == "5"]
drop_from_test = [name for name in candidates if name[9] == "9"]

train = train.drop(columns=drop_from_train)
test = test.drop(columns=drop_from_test)

In [None]:
train.head()

In [None]:
train_unpivoted = train.melt(id_vars=['Square_ID'], value_vars=['total_precip_2015', 'total_precip_2019'], var_name='year', value_name='total_precip')
train_unpivoted

In [None]:
# Separate the melted rows by their `year` label rather than by a fixed row
# count, so the split stays correct whatever the number of squares is.
is_2015 = train_unpivoted['year'] == 'total_precip_2015'
train_df = train_unpivoted[is_2015]
test_df = train_unpivoted[~is_2015]
# Attach the per-square feature columns from `train` by Square_ID.
train_df = train_df.set_index('Square_ID').join(train.set_index('Square_ID')).reset_index()
train_df

In [None]:
test_df = test_df.set_index('Square_ID').join(train.set_index('Square_ID')).reset_index()
test_df

In [None]:
train = train_df.copy()
test = test_df.drop(columns=['target_2015'])
train.describe()

In [None]:
test.describe()

In [None]:
train.head(5)

In [None]:
train['target_2015'].unique()

In [None]:
train.shape

In [None]:
test.head(2)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit one encoder on the year labels of both frames so that train and test
# share the same integer codes.
loc = np.concatenate((train['year'].values, test['year'].values), axis=0)
l = LabelEncoder()
l.fit(loc)

train['year'] = l.transform(train['year'])
test['year'] = l.transform(test['year'])

In [None]:
o=test['Square_ID']

In [None]:
del train['Square_ID']
del test['Square_ID']

In [None]:
train_df=train
test_df=test

In [None]:
# Feature matrix (every column except the label) and the label as a NumPy array.
X = train_df.drop(labels=['target_2015'], axis=1)
y = train_df['target_2015'].values

from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
X_train.shape, y_train.shape, X_cv.shape, y_cv.shape

In [None]:
from math import sqrt 
from sklearn.metrics import mean_squared_error

In [None]:
import lightgbm as lgb

# Wrap the hold-out split in LightGBM datasets.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_cv, label=y_cv)

# Each key appears once: a repeated key in a dict literal silently keeps only
# its last value, so the effective num_leaves has always been 80.
param = {'objective': 'regression',
         'boosting': 'gbdt',
         'metric': 'mae',
         'learning_rate': 0.1,
         'num_iterations': 1000,
         'num_leaves': 80,
         'max_depth': 8,
         'min_data_in_leaf': 11,
         'bagging_fraction': 0.90,
         'bagging_freq': 1,
         'bagging_seed': 101,
         'feature_fraction': 0.90,
         'feature_fraction_seed': 2,
         'max_bin': 250
         }

# NOTE(review): `verbose_eval` is not accepted by every LightGBM release --
# confirm against the installed version.
lgbm = lgb.train(params=param, verbose_eval=100, train_set=train_data, valid_sets=[test_data])

y_pred_lgbm = lgbm.predict(X_cv)
# The label is fitted on its original scale (no log transform is applied to
# it), so the error is reported as plain RMSE without an inverse transform.
print('RMSE:', sqrt(mean_squared_error(y_cv, y_pred_lgbm)))

In [None]:
import seaborn as sns

# Pair each importance with its column name, order from strongest to
# weakest and keep at most the first 50.
importance_pairs = sorted(zip(lgbm.feature_importance(), X.columns), reverse=True)
feature_imp = pd.DataFrame(importance_pairs[:50], columns=['Value','Feature'])
ranked_importance = feature_imp.sort_values(by="Value", ascending=False)

plt.figure(figsize=(12, 10))
sns.barplot(x="Value", y="Feature", data=ranked_importance)
plt.title('LightGBM Features')
plt.tight_layout()
plt.show()

In [None]:
Xtest = test

In [None]:
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor

errlgb = []          # per-fold validation RMSE
y_pred_totlgb = []   # per-fold predictions for the test frame

fold = KFold(n_splits=4, shuffle=True, random_state=101)

# Fold-local variable names are used so that the hold-out split
# (X_train / y_train / X_cv / y_cv) is not overwritten by the loop.
for train_index, test_index in fold.split(X):
    # KFold yields positional indices, so rows are selected with iloc.
    X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_val = y[train_index], y[test_index]

    lgbm = LGBMRegressor(boosting_type='gbdt', num_leaves=100, max_depth=31, learning_rate=0.01, n_estimators=1000, min_child_samples=20, subsample=0.9, bagging_fraction=0.90, feature_fraction=0.90, bagging_freq=1,bagging_seed=101)
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords are
    # not accepted by every LightGBM release -- confirm against the installed version.
    lgbm.fit(X_fold_train, y_fold_train, eval_set=[(X_fold_val, y_fold_val)], verbose=0, early_stopping_rounds=100)

    y_pred_lgbm = lgbm.predict(X_fold_val)
    # The label is on its original scale, so RMSE is computed directly.
    fold_rmse = sqrt(mean_squared_error(y_fold_val, y_pred_lgbm))
    print("RMSE LGBM: ", fold_rmse)

    errlgb.append(fold_rmse)
    p = lgbm.predict(Xtest)
    y_pred_totlgb.append(p)

In [None]:
np.mean(errlgb,0)

In [None]:
lgbm_final = np.mean(y_pred_totlgb,0)
lgbm_final

In [None]:
# Pair each saved Square_ID with the fold-averaged LightGBM prediction and
# write the result without the index column.
submission = pd.DataFrame({
        "Square_ID": o,
        "target_2019": lgbm_final
    })
submission.to_csv('./submission2.csv', index=False)
print(submission)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# The loss is left at the estimator default (squared error). The explicit
# name 'ls' is only accepted by older scikit-learn releases, while the default
# selects the same loss in every release.
gb = GradientBoostingRegressor(verbose=1, learning_rate=0.1, n_estimators=500, random_state=101, subsample=0.8)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_cv)
# RMSE on the hold-out rows.
print('score', sqrt(mean_squared_error(y_cv, y_pred)))

In [None]:
import seaborn as sns

# Pair each importance with its column name, order from strongest to
# weakest and keep at most the first 60.
gb_importance_pairs = sorted(zip(gb.feature_importances_, X.columns), reverse=True)
feature_imp = pd.DataFrame(gb_importance_pairs[:60], columns=['Value','Feature'])
gb_ranked_importance = feature_imp.sort_values(by="Value", ascending=False)

plt.figure(figsize=(12,10))
sns.barplot(x="Value", y="Feature", data=gb_ranked_importance)
plt.title('Gradient Boosting Features')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import KFold

errgb = []          # per-fold validation RMSE
y_pred_totgb = []   # per-fold predictions for the test frame

fold = KFold(n_splits=4, shuffle=True, random_state=101)

# Fold-local variable names are used so that the hold-out split
# (X_train / y_train / X_cv / y_cv) is not overwritten by the loop.
for train_index, test_index in fold.split(X):
    # KFold yields positional indices, so rows are selected with iloc.
    X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_val = y[train_index], y[test_index]

    # Loss left at the estimator default (squared error); the explicit name
    # 'ls' is only accepted by older scikit-learn releases.
    gb = GradientBoostingRegressor(learning_rate=0.1, n_estimators=1500, random_state=101, subsample=0.8)
    gb.fit(X_fold_train, y_fold_train)
    y_pred = gb.predict(X_fold_val)
    fold_rmse = sqrt(mean_squared_error(y_fold_val, y_pred))
    print('Score', fold_rmse)

    errgb.append(fold_rmse)
    p = gb.predict(Xtest)
    y_pred_totgb.append(p)

In [None]:
np.mean(errgb,0)

In [None]:
# Element-wise mean of the per-fold test predictions.
# NOTE(review): this rebinds `gb`, which the cells above use for the fitted
# estimator; from here on `gb` is the averaged prediction array that the
# submission cell reads.
gb = np.mean(y_pred_totgb,0)
gb

In [None]:
# Pair each saved Square_ID with the fold-averaged gradient-boosting
# prediction and write the result without the index column.
submission = pd.DataFrame({
        "Square_ID": o,
        "target_2019": gb
    })
submission.to_csv('./submission5.csv', index=False)
print(submission)

In [None]:
def normalize_feature(*args):
    """Add a standardised copy of each named column to the global ``train`` frame.

    For every column name passed in, a new column called ``norm_<name>`` is
    written to ``train`` holding ``(value - column mean) / column std``.
    Only ``train`` is modified; no other frame receives the new column.

    Args:
        *args: names of existing columns in ``train``.

    Returns:
        None. The result is the side effect on ``train``.
    """
    for column in args:
        values = train[column]
        centred = values - values.mean()
        train['norm_' + column] = centred / values.std()

In [None]:
# Standardise every model input first; each call adds a norm_<name> column
# to `train`.
normalize_feature("elevation", "XY")
normalize_feature("XY_elevation")
normalize_feature("total_precip")
normalize_feature("LC_Type1_mode")

# One numeric feature column per standardised input.
feature_1 = tf.feature_column.numeric_column("norm_elevation")      # elevation
feature_2 = tf.feature_column.numeric_column("norm_XY")             # X * Y cross
feature_3 = tf.feature_column.numeric_column("norm_XY_elevation")   # X * Y * elevation cross
feature_4 = tf.feature_column.numeric_column("norm_total_precip")   # melted precipitation total
feature_5 = tf.feature_column.numeric_column("norm_LC_Type1_mode")  # LC_Type1_mode column

# The list of feature columns handed to the model's input layer.
feature_columns = [feature_1, feature_2, feature_3, feature_4, feature_5]

print(feature_columns)

# Convert the list of feature columns into a layer that will later be fed into
# the model.
# NOTE(review): DenseFeatures is understood to order its columns by name rather
# than by list position -- confirm before mapping learned weights to features.
feature_layer = layers.DenseFeatures(feature_columns)

# Apply the layer once to the training frame so the cell displays the dense
# tensor it produces.
feature_layer(dict(train))

In [None]:
#@title Define the functions that build and train a model
def build_model(my_learning_rate):
  """Create and compile a single-unit regression model on top of ``feature_layer``.

  The model is a Sequential stack of the global ``feature_layer`` followed by
  one Dense unit with ReLU activation and a small L2 kernel penalty. It is
  compiled with RMSprop, mean-squared-error loss and an RMSE metric.

  Args:
    my_learning_rate: learning rate handed to the RMSprop optimizer.

  Returns:
    The compiled, untrained model.
  """
  model = tf.keras.models.Sequential()
  model.add(feature_layer)
  # No input_shape is given: this is not the first layer, so its input size
  # comes from the output of feature_layer.
  # The regularisation factor is passed positionally because the keyword name
  # differs between Keras releases.
  model.add(tf.keras.layers.Dense(units=1,
                                  activation='relu',
                                  kernel_regularizer=tf.keras.regularizers.l2(0.00001)
                                 ))

  # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
  model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=my_learning_rate),
                loss="mean_squared_error",
                metrics=[tf.keras.metrics.RootMeanSquaredError()])

  return model


def train_model(model, dataset, label_name, epochs, batch_size, shuffle=True, my_validation_split=0.2):
  """Fit ``model`` on ``dataset`` and report what it learned.

  Every column of ``dataset`` is converted to a NumPy array and passed to
  ``model.fit`` as a named input, except ``label_name`` which becomes the
  target. ``my_validation_split`` is accepted but not forwarded to ``fit``.

  Args:
    model: object exposing ``fit`` and ``get_weights``.
    dataset: mapping-like object whose ``items()`` yields (name, values).
    label_name: name of the target column inside ``dataset``.
    epochs: number of epochs forwarded to ``fit``.
    batch_size: batch size forwarded to ``fit``.
    shuffle: forwarded to ``fit``.
    my_validation_split: currently unused.

  Returns:
    A tuple ``(trained_weight, trained_bias, epochs, rmse)`` where the first
    two are elements 0 and 1 of ``model.get_weights()``, ``epochs`` is
    ``history.epoch`` and ``rmse`` is the ``root_mean_squared_error`` column
    of the training history.
  """
  # Build the name -> array mapping, then split the label off from it.
  features = {}
  for name, value in dataset.items():
    features[name] = np.array(value)
  label = np.array(features.pop(label_name))

  history = model.fit(x=features, y=label, batch_size=batch_size,
                      epochs=epochs, shuffle=shuffle)

  # Show all learned parameters, then keep the first two entries.
  learned = model.get_weights()
  print(learned)
  trained_weight, trained_bias = learned[0], learned[1]

  # Per-epoch RMSE taken from the training history.
  epoch_list = history.epoch
  rmse = pd.DataFrame(history.history)["root_mean_squared_error"]

  return trained_weight, trained_bias, epoch_list, rmse

print("Defined the create_model and traing_model functions.")

In [None]:
#@title Define the plotting functions
def plot_the_model(trained_weight, trained_bias, feature, label):
  """Plot the fitted line over up to 300 random training examples.

  Args:
    trained_weight: slope of the fitted line.
    trained_bias: intercept of the fitted line.
    feature: column of the global ``train`` frame used for the x axis.
    label: column of the global ``train`` frame used for the y axis.
  """
  # Label the axes.
  plt.xlabel(feature)
  plt.ylabel(label)

  # Scatter a random sample of the data; the sample size is capped at the
  # frame length so small frames do not raise.
  random_examples = train.sample(n=min(300, len(train)))
  plt.scatter(random_examples[feature], random_examples[label])

  # Red line representing the model between x0 and x1. Both end points are
  # evaluated on the line, so the left end includes the slope term as well.
  x0 = -2.5
  x1 = 2.5
  y0 = trained_bias + (trained_weight * x0)
  y1 = trained_bias + (trained_weight * x1)
  plt.plot([x0, x1], [y0, y1], c='r')

  # Render the scatter plot and the red line.
  plt.show()
    

# Needs to be updated to plot the line of best fit
#@title Define the plotting functions
def plot_the_model_plotly(trained_weight_1, trained_weight_2, trained_bias, feature_1, feature_2, label, grid_points=20):
  """Show the fitted plane together with the training points in a 3-D figure.

  The plane ``bias + w1 * x + w2 * y`` is evaluated on a rectangular grid that
  spans the observed range of both features in the global ``train`` frame, and
  is drawn with its x/y coordinates so it lines up with the scatter points.

  Args:
    trained_weight_1: coefficient applied to ``feature_1``.
    trained_weight_2: coefficient applied to ``feature_2``.
    trained_bias: constant term of the plane.
    feature_1: column of ``train`` used for the x axis.
    feature_2: column of ``train`` used for the y axis.
    label: column of ``train`` used for the z axis of the scatter points.
    grid_points: number of grid samples along each axis (default 20).
  """
  # Axis samples covering the data range of each feature.
  xs = np.linspace(train[feature_1].min(), train[feature_1].max(), grid_points)
  ys = np.linspace(train[feature_2].min(), train[feature_2].max(), grid_points)

  # meshgrid gives grid_x[i, j] == xs[j] and grid_y[i, j] == ys[i], which is
  # the row/column layout a surface trace expects for z.
  grid_x, grid_y = np.meshgrid(xs, ys)
  z = trained_bias + (trained_weight_1 * grid_x) + (trained_weight_2 * grid_y)

  fig = go.Figure(data=[go.Surface(x=xs, y=ys, z=z),
                        go.Scatter3d(x=train[feature_1], y=train[feature_2], z=train[label],
                                     mode='markers')])
  fig.update_layout(title='Fit and scatter', autosize=True,
                    width=500, height=500,
                    margin=dict(l=65, r=50, b=65, t=90))

  # Render the surface and the scatter points.
  fig.show()


def plot_the_loss_curve(epochs, rmse):
  """Draw RMSE against epoch on a new figure.

  The y axis is limited to slightly below the smallest and slightly above
  the largest RMSE value (factors 0.95 and 1.03).

  Args:
    epochs: x values, one per epoch.
    rmse: y values exposing ``min()`` and ``max()``.
  """
  lower_limit = rmse.min()*0.95
  upper_limit = rmse.max()*1.03

  plt.figure()
  plt.xlabel("Epoch")
  plt.ylabel("Root Mean Squared Error")

  plt.plot(epochs, rmse, label="Loss")
  plt.legend()
  plt.ylim([lower_limit, upper_limit])
  plt.show()

print("Defined the plot_the_model and plot_the_loss_curve functions.")

In [None]:
import tensorflow as tf

# Hyperparameters for the training run.
learning_rate = 0.01 # originally 0.01
epochs = 20000
batch_size = 10000
# Defined here but not passed to train_model in the call below.
validation_split = 0.20

# The column the model is trained to predict.
my_label="target_2015"

# Discard any pre-existing version of the model.
my_model = None

# Build the model, then fit it on the full `train` frame.
# Note that `epochs` is rebound here: it goes in as the epoch count and comes
# back as the list of epochs returned by train_model.
my_model = build_model(learning_rate)
weight, bias, epochs, rmse = train_model(my_model, train, my_label,
                                         epochs, batch_size)


print("\nThe w1 learned weight for your model is %.4f" % weight[0])
print("\nThe w2 learned weight for your model is %.4f" % weight[1])
print("\nAll weights for your model are: "+ str(weight))
print("The learned bias for your model is %.4f\n" % bias )

# NOTE(review): weight[0] and weight[1] are plotted here against
# norm_elevation and norm_XY; confirm that the kernel rows really follow that
# order (the feature layer may order its columns by name).
plot_the_model_plotly(weight[0], weight[1], bias, "norm_elevation", "norm_XY", my_label)
plot_the_loss_curve(epochs, rmse)

In [None]:
def predict_target_2015(n, my_feature, my_label):
  """Evaluate the model on the labelled frame and print the first ``n`` predictions.

  The global ``train`` frame is used because it is the frame that carries the
  label column. The model is evaluated on all of its rows, then one line is
  printed per example for the first ``n`` rows (fewer if the frame is shorter)
  showing the feature value, the true label, the prediction and their
  difference.

  Args:
    n: number of examples to print.
    my_feature: name of the feature column shown in the first output column.
    my_label: name of the label column in ``train``.
  """
  # Name -> array mapping of every column, with the label split off.
  features = {name:np.array(value) for name, value in train.items()}
  label = np.array(features.pop(my_label))

  my_model.evaluate(x = features, y = label, batch_size=batch_size)

  # Predict only the rows that will be printed.
  n = max(0, min(n, len(label)))
  predicted_values = []
  if n > 0:
    batch = {name: values[:n] for name, values in features.items()}
    predicted_values = my_model.predict_on_batch(x=batch)

  print("feature   label          predicted          variance")
  print("  value   value          value              value")
  print("--------------------------------------")
  for i in range(n):
    actual = label[i]
    predicted = predicted_values[i][0]
    print ("%5.0f %6.0f %15.0f %15.0f " % (features[my_feature][i],
                                   actual,
                                   predicted,
                                   actual - predicted))
    
 
# Needs to be reworked
def predict_target_2019(n, feature, label):
  """Predict with ``my_model`` on the label-free training frame and save the result.

  The global ``train`` frame, minus its ``target_2015`` column, is converted
  to a name -> array mapping and passed to ``my_model.predict_on_batch``. The
  predictions are written to ``Akashtop1.csv``. The three parameters are not
  read by the body.

  Args:
    n: unused.
    feature: unused.
    label: unused.
  """
  # The inputs are the training rows without their label column.
  unlabelled = train.drop(columns=['target_2015'])
  print("\n: Evaluate the new model against the test set:")

  model_inputs = {}
  for name, value in unlabelled.items():
    model_inputs[name] = np.array(value)

  predicted_values = my_model.predict_on_batch(x=model_inputs)

  # One prediction per line, comma-delimited.
  np.savetxt("Akashtop1.csv", predicted_values, delimiter=",")

In [None]:
# Run the prediction helper. Its three arguments are not read by the current
# function body, so the values passed here do not affect the output.
predict_target_2019(16466, 'my_feature', my_label)