# Import Library

In [3]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.model_selection import KFold, train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from keras.layers import Input
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neural_network import MLPRegressor
from tqdm import tqdm
import joblib
from sklearn.neural_network import MLPClassifier
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

import os

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [4]:
def load_any_tab_wave(file_path,features_columns):
    """
    Upload any file from any wave, verifying that the characteristics relating to social isolation are present
    Args:
        file_path: The path to the .tab file.
    Raises:
        ValueError: If none of the specified columns exist in the file.
    """
    try:
        df = pd.read_csv(file_path, sep='\t', index_col='idauniq', low_memory=False)
    except KeyError:
        try:
          df = pd.read_csv(file_path, sep='\t')
        except pd.errors.EmptyDataError:
          print(f"Error: The file '{file_path}' is empty.")
          return None
        except pd.errors.ParserError:
          print(f"Error: Could not parse the file '{file_path}'. Check the file format.")
          return None
    except FileNotFoundError:
      print(f"Error: The file '{file_path}' was not found.")
      return None
    except pd.errors.EmptyDataError:
        print(f"Error: The file '{file_path}' is empty.")
        return None
    except pd.errors.ParserError:
        print(f"Error: Could not parse the file '{file_path}'. Check the file format.")
        return None

    existing_cols = set(df.columns)
    selected_cols = list(existing_cols.intersection(features_columns))

    if not selected_cols:
      raise ValueError("Error: None of the columns were found in the file.")

    df = df[selected_cols].copy()
    return df

In [5]:
def level_isolation(df):
    """
    Scores ranged from 0 to 5, with higher scores indicating greater social isolation.
    """
    def sum_isolation(row):
        resultado = 0
        
        # unmarried/not cohabiting
        if row['scprt'] == 2:
            resultado += 1
          
        # had less than monthly contact (including face-to-face, telephone, or written/e-mail contact) with children
        if not any(1 <= row[col] <= 3 for col in ['scchdg', 'scchdh', 'scchdj', 'scchdk']):
            resultado += 1
        
        # had less than monthly contact (including face-to-face, telephone, or written/e-mail contact) with other inmediate family
        if not any(1 <= row[col] <= 3 for col in ['scfamg','scfamh','scfamj','scfamk']):
            resultado += 1
        
        # had less than monthly contact (including face-to-face, telephone, or written/e-mail contact) with friend
        if not any(1 <= row[col] <= 3 for col in ['scfrdg','scfrdh','scfrdj','scfrdk']):
            resultado += 1
        
        # not participate in organisations such as social clubs or residents groups, religious groups or committees
        if row['scorg96'] == 1:
            resultado += 1
        
        return resultado

    df['level_social_isolation'] = df.apply(sum_isolation, axis=1)
    
    return df

In [6]:
"""
Characteristics that influence the level of social isolation:

sleep: 'pscedc','sctwuh','sctwum','sctsyh','sctsym','sctwup','sctsyp'

physical activity: 'heacta','heactb','heactc'

health: 'hehelf','heill','helim','helwk','hetemp'

difficulty walking : 'hefunc'

eyesight : 'heeye','hefrnd','hepap'

hearing: 'hehear'

weight: 'heswgh','heswgha'

social participation: 'spcar','spcara','sptraa'
partner interaction: 'scprt'
children interaction: 'scchd','scchdg','scchdh','scchdj','scchdk'
other family interaction: 'scfam','scfamf','scfamx','scfamg','scfamh','scfamj','scfamk'
friend interaction: 'scfrd','scfrdg','scfrdh','scfrdj','scfrdk'
sgroup:'scorgpo','scorgnw','scorgrl','scorgch','scorged','scorgsc','scorgsp','scorg95','scorg96'

"""
features_columns = ['pscedc','sctwuh','sctwum','sctsyh','sctsym','sctwup','sctsyp','heacta','heactb','heactc','hehelf','heill','helim','helwk','hetemp','hefunc','heeye','hefrnd','hepap','hehear','heswgh','heswgha','spcar','spcara','sptraa','scprt','scchd','scchdg','scchdh','scchdj','scchdk','scfam','scfamf','scfamx','scfamg','scfamh','scfamj','scfamk','scfrd','scfrdg','scfrdh','scfrdj','scfrdk','scorgpo','scorgnw','scorgrl','scorgch','scorged','scorgsc','scorgsp','scorg95','scorg96']

In [7]:
data_isolation=load_any_tab_wave("data/dataset/wave_9_elsa_data_eul_v2.tab",features_columns)

In [8]:
data_isolation.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8736 entries, 104178 to 908547
Data columns (total 51 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   spcar    8736 non-null   int64 
 1   scfrdk   8736 non-null   int64 
 2   heacta   8736 non-null   int64 
 3   hehelf   8736 non-null   int64 
 4   scchd    8736 non-null   int64 
 5   sctsym   8736 non-null   int64 
 6   sctsyp   8736 non-null   object
 7   scfrdg   8736 non-null   int64 
 8   hefrnd   8736 non-null   int64 
 9   scfamg   8736 non-null   int64 
 10  heill    8736 non-null   int64 
 11  scorgnw  8736 non-null   int64 
 12  hefunc   8736 non-null   int64 
 13  scorg95  8736 non-null   int64 
 14  scorgch  8736 non-null   int64 
 15  heactb   8736 non-null   int64 
 16  scfam    8736 non-null   int64 
 17  heeye    8736 non-null   int64 
 18  hetemp   8736 non-null   int64 
 19  hepap    8736 non-null   int64 
 20  scfrd    8736 non-null   int64 
 21  scorgpo  8736 non-null   int64 
 22

# Sleep quality

In [9]:
def to_datetime(df,sctwuh,sctwum,sctsyh,sctsym,sctwup,sctsyp):
  """
    Change the colums(hour+minute) to datatime
    Args:
        df: dataframe
        sctwuh,sctwum,sctsyh,sctsym,sctwup,sctsyp: columns that contains the information they all answered persons
    Raises:
        ValueError: If none of the specified columns exist in the file.
  """
  try:  
    sleep = ['sctwuh','sctwum','sctsyh','sctsym']
    df[sleep] = df[sleep].clip(lower=0)
    df.loc[df['sctwup'] == 'PM', 'sctwuh'] += 12
    df.loc[df['sctsyp'] == 'PM', 'sctsyh'] += 12
    df.loc[df['sctsyh'] == 12, 'sctsyh'] = 0
    df['hsleep']=df['sctwuh'] + df['sctwum']/60 - df['sctsyh'] - df['sctsym']/60
    df.loc[df['hsleep'] < 0, 'hsleep'] += 24
    df = df[df['hsleep'] <= 13].copy()
    
    return df
  except (KeyError, TypeError) as e:
    print(f"An error occurred: {e}")
    return df

In [10]:
def group_sleep(row): 
  """
    Hours sleep was further categorized into three groups: 
        -3: hours<6
        -2: 6<=hours<=8
        -1: hours>8
  """    
  if row['hsleep'] < 6:
    return 3
  elif row['hsleep'] > 8:
    return 1
  else:
    return 2

In [11]:
data_isolation=to_datetime(data_isolation,'sctwuh','sctwum','sctsyh','sctsym','sctwup','sctsyp')

In [12]:
data_isolation['gsleep'] = data_isolation.apply(group_sleep, axis=1).copy()

In [13]:
data_isolation.head()

Unnamed: 0_level_0,spcar,scfrdk,heacta,hehelf,scchd,sctsym,sctsyp,scfrdg,hefrnd,scfamg,...,heswgha,sctwup,heswgh,sctwuh,scfrdh,scorg96,helim,scchdg,hsleep,gsleep
idauniq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
104178,1,2,1,3,1,20,AM,4,4,4,...,3,AM,1,7,2,0,-1,4,7.166667,2
106612,1,1,1,3,1,30,PM,4,2,-1,...,3,AM,2,6,1,0,-1,4,7.0,2
104826,1,4,4,3,1,45,PM,4,2,4,...,3,AM,1,7,4,0,1,4,8.5,1
105887,1,6,4,2,1,30,AM,4,2,4,...,3,AM,1,5,3,1,-1,4,4.666667,3
106269,2,-2,4,-1,-2,0,-2,-2,-1,-2,...,-1,-2,-1,0,-2,-2,1,-2,0.0,3


# Physical Activity

In [14]:
pa=['heacta','heactb','heactc']
data_isolation = data_isolation[(data_isolation[pa] > 0).all(axis=1)].copy()

In [15]:
def group_pa(row): 
  """
    Physical activity was further categorized into three groups: 
        -3: None (no moderate or vigorous activity on a weekly basis)
        -2: Moderate activity at least once a week
        -1: Vigorous activity at least once a week
    Raises:
        ValueError: If none of the specified columns exist in the file.
  """    
  if row['heacta'] <= 2:
    return 1
  elif row['heactb'] <= 2:
    return 2
  else:
    return 3

In [16]:
data_isolation['gpa'] = data_isolation.apply(group_pa, axis=1).copy()

# Self-reported Health

In [17]:
health=['hehelf','heill']
data_isolation = data_isolation[(data_isolation[health] > 0).all(axis=1)]

In [18]:
def group_he(row): 
  """
    Self-reported health was further categorized into 2 groups: 
        -1: good health, comprising those who reported their health as excellent, very good or good
        -2: poor health, comprising fair and poor responses
  """    
  if row['hehelf'] <= 3:
    return 1
  else:
    return 2

In [19]:
data_isolation['ghealt1'] = data_isolation.apply(group_he, axis=1).copy()
data_isolation['ghealt2'] = data_isolation['heill'].replace({1: 2, 2: 1}).copy()

In [20]:
data_isolation.head(10)

Unnamed: 0_level_0,spcar,scfrdk,heacta,hehelf,scchd,sctsym,sctsyp,scfrdg,hefrnd,scfamg,...,sctwuh,scfrdh,scorg96,helim,scchdg,hsleep,gsleep,gpa,ghealt1,ghealt2
idauniq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
104178,1,2,1,3,1,20,AM,4,4,4,...,7,2,0,-1,4,7.166667,2,1,1,1
106612,1,1,1,3,1,30,PM,4,2,-1,...,6,1,0,-1,4,7.0,2,1,1,1
104826,1,4,4,3,1,45,PM,4,2,4,...,7,4,0,1,4,8.5,1,3,1,2
105887,1,6,4,2,1,30,AM,4,2,4,...,5,3,1,-1,4,4.666667,3,3,1,1
103787,2,-1,4,2,-1,0,,-1,3,-1,...,0,-1,-1,2,-1,0.0,3,2,1,2
105440,1,4,4,2,1,0,PM,4,2,4,...,7,4,0,-1,4,8.5,1,2,1,1
107226,1,-9,4,3,1,45,PM,4,3,4,...,7,3,0,-1,4,7.583333,2,2,1,1
106497,1,2,1,1,2,0,,4,5,4,...,5,2,0,2,-1,5.666667,3,1,1,2
108362,1,1,1,3,1,30,PM,4,1,4,...,6,1,0,2,4,9.0,1,1,1,2
108018,2,1,4,3,1,0,,-9,2,4,...,7,2,0,1,4,7.0,2,2,1,2


# Eyesight

In [21]:
data_isolation = data_isolation[(data_isolation[['heeye']] > 0).all(axis=1)]
data_isolation['heeye'] = data_isolation['heeye'].replace(6, 5).copy()
'''
    The value 6 is low number so it will changed to 5
'''

'\n    The value 6 is low number so it will changed to 5\n'

In [22]:
def group_eye(row): 
  """
    Eyesight was further categorized into three groups: 
        -1: Optimal(Excellent and Very good)
        -2: Good
        -3: Poor(Fair or Poor)
  """    
  if row['heeye'] <= 2:
    return 1
  elif row['heeye'] == 3:
    return 2
  else:
    return 3

In [23]:
data_isolation['geye'] = data_isolation.apply(group_eye, axis=1).copy()

# Hearing

In [24]:
data_isolation = data_isolation[(data_isolation[['hehear']] > 0).all(axis=1)]
data_isolation['hehear'] = data_isolation['hehear'].replace(6, 5).copy()
'''
    The value 6 is low number so it will changed to 5
'''

'\n    The value 6 is low number so it will changed to 5\n'

In [25]:
def group_hear(row): 
  """
    Hearing was further categorized into three groups: 
        -1: Optimal(Excellent and Very good)
        -2: Good
        -3: Poor(Fair or Poor)
  """    
  if row['hehear'] <= 2:
    return 1
  elif row['hehear'] == 3:
    return 2
  else:
    return 3

In [26]:
data_isolation['ghear'] = data_isolation.apply(group_hear, axis=1).copy()

# Difficulty walking

In [27]:
data_isolation = data_isolation[(data_isolation[['hefunc']] > 0).all(axis=1)]
data_isolation['hefunc'] = data_isolation['hefunc'].replace(4, 3).copy()
'''
    The value 4 is low number so it will changed to 3
'''

'\n    The value 4 is low number so it will changed to 3\n'

In [28]:
'''
    Difficulty walking was further categorized into three groups: 
    1: no difficulty
    2: some difficulty
    3: much difficulty
'''
data_isolation['gwdif'] = data_isolation['hefunc'].copy()

# Weight

In [29]:
data_isolation = data_isolation[(data_isolation[['heswgh']] > 0).all(axis=1)]

In [30]:
'''
    Weight was further categorized into three groups: 
    1: light
    2: right
    3: heavy
'''
data_isolation['gwg'] = data_isolation['heswgh'].replace({1: 2, 2: 3, 3: 1}).copy()

# Social Participation

In [31]:
spart=['spcar','spcara','sptraa']
data_isolation[['spcara']] = data_isolation[['spcara']].replace(-1, 0).copy()
data_isolation = data_isolation[(data_isolation[spart] >= 0).all(axis=1)]

# Children Interaction

In [32]:
ch=['scchd','scchdg','scchdh','scchdj','scchdk']
data_isolation = data_isolation[(data_isolation[['scchd']] > 0).all(axis=1)]
'''
'scchdg','scchdh','scchdj','scchdk': assign 6 all values -1 that means it don't has children
'''
data_isolation[ch] = data_isolation[ch].replace(-1, 6).copy()
data_isolation = data_isolation[(data_isolation[ch] > 0).all(axis=1)]

# Other family interaction

In [33]:
of = ['scfam','scfamg','scfamh','scfamj','scfamk']
'''
'scfamg','scfamh','scfamj','scfamk': assign 6 all values -1 that means it don't has other family
'''
data_isolation[of] = data_isolation[of].replace(-1, 6).copy()
data_isolation = data_isolation[(data_isolation[of] > 0).all(axis=1)]

# Friend Interaction

In [34]:
fr = ['scfrd','scfrdg','scfrdh','scfrdj','scfrdk']
'''
'scfrdg','scfrdh','scfrdj','scfrdk': assign 6 all values -1 that means it don't has friend
'''
data_isolation[fr] = data_isolation[fr].replace(-1, 6).copy()
data_isolation = data_isolation[(data_isolation[fr] > 0).all(axis=1)]

# Isolation level

In [35]:
data_isolation=level_isolation(data_isolation).copy()

# Characteristics that influence the level of social isolation:

## Sleep,physical activity,health,difficulty walking,eyesight,hearing,weight,social participation,partner interaction,children interaction,other family interaction,friend interaction

In [None]:
features_isolation=spart+ch+of+fr+['scprt','scorg96','gsleep','gpa','ghealt1','ghealt2','geye','ghear','gwdif','gwg']
X = data_isolation[features_isolation]
y = data_isolation['level_social_isolation']

scaler = StandardScaler()
X = scaler.fit_transform(X)

mlp = MLPRegressor(random_state=42)

param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant', 'adaptive']
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(mlp, param_grid, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)

with tqdm(total=100, desc="Entrenamiento") as pbar:
  pbar.update(10)
  grid_search.fit(X, y)
  pbar.update(90)

print("Mejores parámetros:", grid_search.best_params_)

best_mlp = grid_search.best_estimator_

joblib.dump(best_mlp, 'data/models/modelo_mlp.pkl')
joblib.dump(scaler, 'data/models/scaler_mlp.pkl')

In [None]:
features_isolation=spart+ch+of+fr+['scprt','scorg96','gsleep','gpa','ghealt1','ghealt2','geye','ghear','gwdif','gwg']
X = data_isolation[features_isolation]
y = data_isolation['level_social_isolation']

scaler = StandardScaler()
X = scaler.fit_transform(X)

models = {
    'RandomForest': (RandomForestRegressor(), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20]
    }),
    'LinearRegression': (LinearRegression(), {}),
    'Ridge': (Ridge(), {'alpha': [0.1, 1, 10]}),
    'Lasso': (Lasso(), {'alpha': [0.1, 1, 10]}),
    'MLPRegressor': (MLPRegressor(max_iter=500), {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'alpha': [0.0001, 0.001, 0.01]
    })
}

best_models = {}
for name, (model, params) in models.items():
  grid_search = GridSearchCV(model, params, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
  grid_search.fit(X, y)
  best_models[name] = grid_search.best_estimator_
  print(f'Mejor modelo {name}: {grid_search.best_params_}')

voting_regressor = VotingRegressor(estimators=[(name, model) for name, model in best_models.items()])
voting_regressor.fit(X, y)

joblib.dump(voting_regressor, 'data/models/modelo_vt.pkl')
joblib.dump(scaler, 'data/models/scaler_vt.pkl')

# Classifier

In [None]:
features_isolation=spart+ch+of+fr+['scprt','scorg96','gsleep','gpa','ghealt1','ghealt2','geye','ghear','gwdif','gwg']
X = data_isolation[features_isolation]
y = data_isolation['level_social_isolation']


scaler = StandardScaler()
X = scaler.fit_transform(X)

parameter_space = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

mlp = MLPClassifier(max_iter=1000)

clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=5, scoring='f1_macro', refit='f1_macro')
clf.fit(X, y)

In [None]:
joblib.dump(clf, 'data/models/mlp_cl.pkl')
joblib.dump(scaler, 'data/models/scalermlp_cl.pkl')

In [None]:
features_isolation=spart+ch+of+fr+['scprt','scorg96','gsleep','gpa','ghealt1','ghealt2','geye','ghear','gwdif','gwg']
X = data_isolation[features_isolation]
y = data_isolation['level_social_isolation']

# Escalar los datos
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Diccionario para almacenar los mejores modelos
best_models = {}

# Hiperparámetros y clasificadores individuales
param_grids = {
    'lr': {
        'C': [0.1, 1, 10]
    },
    'svm': {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto']
    },
    'dt': {
        'max_depth': [None, 5, 10]
    },
    'knn': {
        'n_neighbors': [3, 5, 7]
    },
    'mlp': {
        'hidden_layer_sizes': [(10,), (50,), (100,)],
        'activation': ['relu', 'tanh']
    }
}

classifiers = {
    'lr': LogisticRegression(solver='liblinear', random_state=42),
    'svm': SVC(random_state=42),
    'dt': DecisionTreeClassifier(random_state=42),
    'knn': KNeighborsClassifier(),
    'mlp': MLPClassifier(max_iter=1000, random_state=42)
}

# Paso 1: Buscar los mejores hiperparámetros para cada modelo
for name, clf in classifiers.items():
    print(f"Optimizing {name}...")
    grid_search = GridSearchCV(clf, param_grids[name], cv=3, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_scaled, y)
    best_models[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")

# Paso 2: Crear el VotingClassifier con los mejores modelos
voting_clf = VotingClassifier(
    estimators=[
        ('lr', best_models['lr']),
        ('svm', best_models['svm']),
        ('dt', best_models['dt']),
        ('knn', best_models['knn']),
        ('mlp', best_models['mlp'])
    ],
    voting='hard'  # Puedes cambiar a 'soft' si tienes probabilidades disponibles
)

# Entrenar el VotingClassifier
voting_clf.fit(X_scaled, y)

# Paso 3: Guardar el modelo y el scaler
os.makedirs('data/models', exist_ok=True)
joblib.dump(voting_clf, 'data/models/voting_clf.pkl')
joblib.dump(scaler, 'data/models/scaler_voting_clf.pkl')

print("VotingClassifier and scaler saved successfully!")