In [34]:
import pandas as pd
import numpy as np
from pycaret.classification import *
import os
from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix, classification_report,average_precision_score, precision_recall_curve
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import pickle
import geopandas as gpd
import sweetviz as sv

In [35]:
# reading dataset
data = pd.read_pickle('../data/manipulated/geo_city.pkl')
# to have a non spatial modeling we drop spatial features keeping longitude and latitude for the buildings    
data.head()
data.drop(columns=['geometry'], axis=1, inplace=True)

#turning to pandas dataframe
data = pd.DataFrame(data)



In [36]:
#pycaret recognize some columns as duplicate. in order we decide which ones could be, we check all the columns
def make_columns_distinct(df):
    """A function that makes all column names distinct."""
    modified_columns = []
    seen_columns = set()

    for column in df.columns:
        modified_column = column
        counter = 1

        while modified_column in seen_columns:
            modified_column = f"{column}_{counter}"
            counter += 1

        modified_columns.append(modified_column)
        seen_columns.add(modified_column)

    df.columns = modified_columns
    return df


# Make the columns distinct
df_distinct = make_columns_distinct(data)

# Print the modified column names
print("Modified column names:")
print(df_distinct.columns)

Modified column names:
Index(['obj_type', 'name', 'info', 'damage_gra', 'det_method', 'notation',
       'or_src_id', 'dmg_src_id', 'cd_value', 'real', 'province', 'city',
       'population', 'income', 'total_sales', 'second_sales', 'water_access',
       'elec_cons', 'building_perm', 'land_permited', 'labour_fource',
       'unemployment', 'agricultural', 'life_time', 'hb_per100000',
       'fertility', 'hh_size', 'point'],
      dtype='object')


In [38]:
#columns with geometry
#df_distinct.drop(columns=['Geometry','centroid'], axis=1, inplace=True)                               

In [None]:
# a quick EDA with sweetviz
#anaylsis = sv.analyze(df_distinct)
#anaylsis.show_html('df_distinct.html')

In [39]:
def clean_data(data):
    """A  function that cleans the data."""
    data = data.dropna()
    data = data.drop_duplicates()
    return data


def remove_duplicate_columns(data):
    """ A function that removes duplicate columns from a DataFrame or a 2D list."""
    if isinstance(data, pd.DataFrame):
        # Remove duplicate columns from a DataFrame
        return data.loc[:, ~data.columns.duplicated()]
    elif isinstance(data, list) and all(isinstance(row, list) for row in data):
        # Remove duplicate columns from a 2D list
        transposed = list(map(list, zip(*data)))  # Transpose the list
        deduplicated = [list(t) for t in set(tuple(row) for row in transposed)]  # Deduplicate
        return list(map(list, zip(*deduplicated)))  # Transpose back
    else:
        raise ValueError("Input must be a DataFrame or a 2D list")

clean_data(df_distinct)
remove_duplicate_columns(df_distinct)
pd.set_option('display.max_columns', None)
column_names = data.columns.tolist()


for column in df_distinct.columns:
    print(column)

obj_type
name
info
damage_gra
det_method
notation
or_src_id
dmg_src_id
cd_value
real
province
city
population
income
total_sales
second_sales
water_access
elec_cons
building_perm
land_permited
labour_fource
unemployment
agricultural
life_time
hb_per100000
fertility
hh_size
point


In [40]:
#ordinalization of target vaiable so that pycaret can use it for its models

mapping = {
    "Damaged": 3,
    "Destroyed": 4,
    "No visible damage": 1,
    "Possibly damaged": 2
}

df_distinct['damage_gra'] = df_distinct['damage_gra'].replace(mapping)



In [42]:
#removing some columns as ve observe as do not have any effect on the target variable some
#df_distinct.drop(columns=['Okul Türü', 'Okul Türü_y', 'Okul Türü_x', 'name', 'cd_value', 'İlçeler', 'Kayıtlı Nüfus'], axis=1, inplace=True)


In [None]:
#defining categorical and numerical features
categorical_features = ['obj_type','info', 'notation','or_src_id', 'Province','Municipio',]
numeric_features = ['population','income','total_sales', 'second_sales', 'water_access', 'elec_cons', 'building_perm',
           'land_permited', 'labour_fource','unemployment','agricultural', 'life_time', 'hb_per100000', 'fertility',
           'hh_size','latitude','longitude', 'nearest_fault_distance_km','nearest_eq_distance_km',
           'Longitude','Latitude', 'eclass_public','eclass_private','eclass_total','enumber_public','enumber_private',
           'enumber_total', 'e_male_public', 'e_female_public', 'e_total_public', 'e_male_private', 'e_female_private',
           'e_total_private','e_total','et_male_public','et_female_public', 'et_total_public','et_male_private', 
           'et_total_private', 'et_total', 'hclass_public','hclass_private','hclass_total','hnumber_public',
           'hnumber_private', 'hnumber_total','h_male_public','h_female_public','h_total_public','h_male_private',
           'h_female_private','h_total_private','h_total','hs_religious_male_public','hs_religious_female_public',
           'hs_religious_total_public', 'hs_religious_male_private', 'hs_religious_female_private', 'hs_religious_total_private',
           'hs_religious_total','h_male_occupational_public','h_female_occupational_public','h_total_occupational_public',
           'h_male_occupational_private','h_male_occupational_private_1','h_total_ocupational_private','h_occupational_total',
           'h_male_normal_public','h_female_normal_public','h_total_normal_public','h_male_normal_private','h_female_normal_private',
           'h_total_normal_private','h_normal_total','ht_male_public','ht_female_public','ht_total_public','ht_male_private',
           'ht_female_private','ht_total_private','ht_toplam','mclass_public','mclass_private','mclass_total','mnumber_public',
           'mnumber_private','mnumber_total','m_male_public','m_female_public','m_total_public','m_male_private','m_female_private',
           'm_total_private','m_total','mt_male_public','mt_female_public','mt_total_public','mt_male_private','mt_female_private',
           'mt_total_private','mt_total','pclass_public','pclass_private','pclass_toplam','pnumber_public','pnumber_private',
           'pnumber_total','p_male_public','p_female_public','p_total_public','p_male_private','p_female_private','p_total_private',
           "p_total",'pt_male_public','pt_female_public','pt_total_public','pt_male_private','pt_female_private','pt_total_private',
           'pt_total',
           ]


In [43]:


df_distinct.dropna(inplace=True)            
           

In [48]:
from pycaret.classification import setup, create_model


selection = setup(data=df_distinct, 
                  target='damage_gra',
                  train_size=0.7, 
                  normalize=True, 
                  transformation=True,
                  session_id=123,  # for reproducibility
                  fold_shuffle=True , # activate shuffling,
                  transformed_feature_names='get_feature_names_out'
                 )


# Create Ridge Regression model
ridge_model = create_model('ridge')

# Get feature importance
feature_importance = ridge_model.coef_

# Get column names
column_names = df_distinct.columns

# Create a DataFrame to display feature importance
feature_importance_df = pd.DataFrame({'Variable': column_names, 'Importance': feature_importance})

# Sort by importance (absolute values)
feature_importance_df['Importance_abs'] = abs(feature_importance_df['Importance'])
feature_importance_df = feature_importance_df.sort_values('Importance_abs', ascending=False).reset_index(drop=True)

# Print the selected variables
print("Selected Variables:")
print(feature_importance_df)


TypeError: setup() got an unexpected keyword argument 'transformed_feature_names'

In [None]:
# Create Ridge Regression model
ridge_model = create_model('ridge')
lasso_model = create_model('lasso')


In [None]:

# Get feature importance
feature_importance = ridge_model.coef_

# Get column names
column_names = df_distinct.columns

print(len(column_names))
print(len(feature_importance))



In [None]:

# Create a DataFrame to display feature importance
feature_importance_df = pd.DataFrame({'Variable': column_names, 'Importance': feature_importance})

# Sort by importance (absolute values)
feature_importance_df['Importance_abs'] = abs(feature_importance_df['Importance'])
feature_importance_df = feature_importance_df.sort_values('Importance_abs', ascending=False).reset_index(drop=True)

# Print the selected variables
print("Selected Variables:")
print(feature_importance_df)

In [None]:

# Set up the PyCaret environment with preprocessing
s = ClassificationExperiment()
s = setup(df_distinct, target='damage_gra', session_id=42, preprocess=True, normalize=True, train_size=0.8,)
s.eda

In [None]:


best_model = s.compare_models()
best_model

In [None]:
evaluate_model(best_model)

In [None]:
plot_model(best, plot = 'auc')

In [None]:
plot_model(best, plot = 'auc')

In [None]:

# Train the best model
trained_model = finalize_model(best_model)


In [None]:
# Get the train and test datasets
X_train = get_config('X_train')
X_test = get_config('X_test')
y_train = get_config('y_train')
y_test = get_config('y_test')

X_test

In [None]:
new_data = []
# Make predictions on new data
predictions = predict_model(trained_model, data= X_test)

In [None]:
# Assuming you have already trained and finalized a model
trained_model = load_model('path_to_trained_model')

# Plot the confusion matrix
plot_model(trained_model, plot='confusion_matrix')


In [None]:
db.describe()


In [None]:
print(db.info(),)


In [None]:
# Convert the GeoDataFrame to a DataFrame
df = pd.DataFrame(db)

# The 'geometry' column is automatically converted to a string representation of the geometry object,
# so you may want to convert it back to a GeoSeries using geopandas' 'from_wkt()' function:
from shapely.wkt import loads
df['geometry'] = df['geometry'].apply(lambda x: loads(x.wkt))

# Print the resulting DataFrame
df

In [None]:

clf1 = setup(data = df, target = 'damage_gra')

In [None]:
clf1 = setup (data = df, target = 'damage_gra', categorical_features = category, numeric_features = numerical)