# Load Data and Preprocessing

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [61]:
# Load the training data from the CSV in the working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")
# Preview the first five rows (the default row count for head()).
train.head(n=5)

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0,5,8,5,8,6,4,4,3,3,...,5,3,3,5,4,7,5,7,3,0.445
1,1,6,7,4,4,8,8,3,5,4,...,7,2,0,3,5,3,3,4,3,0.45
2,2,6,5,6,7,3,7,1,5,4,...,7,3,7,5,6,8,2,3,3,0.53
3,3,3,4,6,5,4,8,4,7,6,...,2,4,7,4,4,6,5,7,5,0.535
4,4,5,3,2,6,4,4,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415


In [62]:
# Remove the "id" column from the training frame.
# errors="ignore" keeps this cell safe to re-run: `train` is rebound here, so a
# second execution would otherwise raise KeyError on the already-missing column.
train = train.drop(columns=["id"], errors="ignore")
train.head()

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,5,8,5,8,6,4,4,3,3,4,...,5,3,3,5,4,7,5,7,3,0.445
1,6,7,4,4,8,8,3,5,4,6,...,7,2,0,3,5,3,3,4,3,0.45
2,6,5,6,7,3,7,1,5,4,5,...,7,3,7,5,6,8,2,3,3,0.53
3,3,4,6,5,4,8,4,7,6,8,...,2,4,7,4,4,6,5,7,5,0.535
4,5,3,2,6,4,4,3,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415


In [63]:
# Feature matrix: the first 20 columns of `train`, selected by position,
# converted to a NumPy array.
X = train.iloc[:, :20].to_numpy()
X

array([[5, 8, 5, ..., 5, 7, 3],
       [6, 7, 4, ..., 3, 4, 3],
       [6, 5, 6, ..., 2, 3, 3],
       ...,
       [7, 3, 9, ..., 5, 2, 4],
       [7, 3, 3, ..., 7, 6, 4],
       [4, 5, 6, ..., 7, 7, 8]], dtype=int64)

In [64]:
# Target vector: the last column of `train`, converted to a NumPy array.
y = train.iloc[:, -1].to_numpy()
y

array([0.445, 0.45 , 0.53 , ..., 0.485, 0.495, 0.56 ])

In [65]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Support Vector Regression

In [None]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardise the features, then fit a linear-kernel support vector regressor.
regr = make_pipeline(
    StandardScaler(),
    SVR(kernel="linear", C=1.0, epsilon=0.1),
)
# NOTE(review): SVR fit time may scale poorly with the number of rows -- confirm
# this fit is feasible on the full training split before relying on it.
regr.fit(X_train, y_train)


In [37]:
# Predict on the held-out split with the fitted SVR pipeline.
y_pred = regr.predict(X_test)

In [38]:
from sklearn.metrics import r2_score

# Coefficient of determination of the SVR predictions on the held-out split.
svr_r2 = r2_score(y_test, y_pred)
print(f"R2 score: {svr_r2}")

R2 score: 0.25731008772139463


# Feature Selection

In [66]:
# Pearson-correlation-based univariate feature selection for numeric inputs
# and a numeric target (scored with f_regression).
from sklearn.datasets import make_regression  # not used in this cell
from sklearn.feature_selection import SelectKBest, f_regression

# Selector that keeps the k highest-scoring columns.
# NOTE(review): X is sliced to 20 columns, so k=20 appears to keep every
# column -- confirm whether a smaller k was intended.
fs = SelectKBest(score_func=f_regression, k=20)

# Fit the selector on (X, y) and keep only the selected columns.
X_selected = fs.fit_transform(X, y)
print(X_selected.shape)

(1117957, 20)


In [67]:
# Print the selected feature matrix (large arrays are abbreviated with "...").
print(X_selected)

[[5 8 5 ... 5 7 3]
 [6 7 4 ... 3 4 3]
 [6 5 6 ... 2 3 3]
 ...
 [7 3 9 ... 5 2 4]
 [7 3 3 ... 7 6 4]
 [4 5 6 ... 7 7 8]]


# Fit Linear Regression Model

In [68]:
from sklearn.model_selection import train_test_split

# Re-split using the selected features. This rebinds X_train / X_test /
# y_train / y_test, with the same 20% hold-out and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=0,
)

In [69]:
from sklearn.linear_model import LinearRegression

# Linear regression model fitted on the training split.
reg = LinearRegression()
reg.fit(X=X_train, y=y_train)

In [70]:
# Fitted regression coefficients of the linear model.
print('Coefficients: ', reg.coef_)

# Score on the held-out split: 1 means perfect prediction.
# (Label fixed: it previously printed "Variancecore".)
print('Variance score: {}'.format(reg.score(X_test, y_test)))

Coefficients:  [0.0056201  0.00565183 0.00566434 0.00568386 0.0056581  0.00566856
 0.00565507 0.00563785 0.00563077 0.00564746 0.00564323 0.00563667
 0.00567494 0.00565175 0.00564836 0.00561318 0.00567771 0.0056439
 0.00561743 0.00563892]
Variancecore: 0.8443390599525149


In [71]:
# Predictions of the linear model on the held-out split.
y_pred_test = reg.predict(X_test)
# Display the prediction array as the cell output.
y_pred_test

array([0.46075131, 0.53922069, 0.53417695, ..., 0.50569833, 0.47740965,
       0.49432981])

In [72]:
from sklearn.metrics import r2_score

# R^2 of the linear model's predictions on the held-out split.
linreg_r2 = r2_score(y_test, y_pred_test)
print(linreg_r2)

0.8443390599525149


In [73]:
# Load the test set and predict with the fitted linear model.
test = pd.read_csv("test.csv")

# "FloodProbability" is only present in test.csv if predictions were written
# back into that file earlier; errors="ignore" lets this cell run whether or
# not the column exists. A separate name is used so the training matrix `X`
# is not clobbered, which would break re-running the earlier cells.
X_final = test.drop(columns=["id", "FloodProbability"], errors="ignore")

# Apply the same fitted selector used for training, and pass a bare array
# because the selector and model were fitted on NumPy arrays without column names.
y_pred_final = reg.predict(fs.transform(X_final.to_numpy()))
y_pred_final



array([0.57362761, 0.45526461, 0.45475782, ..., 0.62441788, 0.5509352 ,
       0.51154771])

In [32]:
import pandas as pd

# Attach the predictions to the test rows and save the result under a new file
# name. Writing back to test.csv would alter the raw input that other cells read.
csv_file_path = 'test.csv'
output_file_path = 'test_with_predictions.csv'

# Step 1: Read the test set into a DataFrame
df = pd.read_csv(csv_file_path)

# Step 2: Add (or replace) the "FloodProbability" column with the predictions
new_y_values = y_pred_final
df['FloodProbability'] = new_y_values

# Step 3: Write the augmented frame to its own file, leaving test.csv untouched
df.to_csv(output_file_path, index=False)

In [33]:
# Build the submission file: one row per test id with its predicted probability.
# Only the "id" column is needed, so only that column is read.
test_id = pd.read_csv("test.csv", usecols=["id"])["id"]

submission = pd.DataFrame({
    "id": test_id,
    "FloodProbability": y_pred_final,
})

# index=False: without it the row index is written as an extra unnamed first
# column alongside id and FloodProbability.
submission.to_csv("submission2.csv", index=False)

In [34]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,id,FloodProbability
0,1117957,0.573628
1,1117958,0.455265
2,1117959,0.454758
3,1117960,0.466183
4,1117961,0.466053
