In [1]:
import pandas as pd
# pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import f_classif
from sklearn.cluster import KMeans
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency

In [2]:
df = pd.read_excel('datasets/FoodInsecurity_Hispanic_Demographics_Tone_Preferences_Dataset.xlsx')

In [3]:
df.head(2)

Unnamed: 0,age,gender,ethnicity,race,education,marital_status,income,employment,language,disability,states,sample_1,sample_2,sample_3,sample_4,sample_5,sample_6,sample_7,sample_8
0,45-54,female,non hispanic,native american,High School,na,"$25,000 - $49,999",Employed Part time,both,i do not have a disability,indiana,Persuasive,Simplier,Empathetic,Persuasive,Original,Original,Persuasive,Original
1,18-24,male,hispanic,white,High School,single,"Less than $25,000",Employed Part time,english,i do not have a disability,illinois,Original,Simplier,Empathetic,Simplier,Simplier,Original,Original,Persuasive


In [4]:
df.drop(['states','disability'], axis=1, inplace=True)

### Reshape data

In [5]:
melted = df.copy()
id_vars = melted.columns.difference(['sample_1', 'sample_2', 'sample_3', 'sample_4', 'sample_5', 'sample_6', 'sample_7', 'sample_8'])
# Melt dataframe
melted_data = pd.melt(melted, id_vars=id_vars, value_vars=['sample_1', 'sample_2', 'sample_3', 'sample_4', 'sample_5', 'sample_6', 'sample_7', 'sample_8'],value_name='tone')
melted_data.drop('variable',axis=1, inplace=True)

In [24]:
melted_data.shape

(400, 10)

## Method 1: Filter

**Objective**: Combine the eight separate tone columns into a single column, allowing us to use this consolidated tone column as the target variable in our model.

In [6]:
X = melted_data.drop('tone', axis=1)
y = melted_data['tone']

In [7]:
# Apply LabelEncoder to categorical columns
label_encoders = {}
for column in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le

In [8]:
x_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder()),  # Encode categorical variables
])
preprocessor = ColumnTransformer(transformers=[('x', x_transformer, X.columns)])

y_transformer = LabelEncoder()

# Fit and transform the target variable
y_train_encoded = y_transformer.fit_transform(y)


In [9]:
# Assuming X and y are now numeric
F_values, p_values = f_classif(X, y)

# Create a DataFrame to view results
results = pd.DataFrame({
    'Feature': X.columns,
    'F-Value': F_values,
    'P-Value': p_values
})

# Select features with significant p-values
significant_features = results[results['P-Value'] < 0.18]

In [10]:
significant_features

Unnamed: 0,Feature,F-Value,P-Value
0,age,2.532329,0.056678
2,employment,1.720474,0.162207
7,marital_status,1.781128,0.150174


In [11]:

# Apply Chi-Square test
chi2_values, p_values = chi2(X, y)

# Create a DataFrame to view results
results = pd.DataFrame({
    'Feature': X.columns,
    'Chi2-Value': chi2_values,
    'P-Value': p_values
})

# Display results
print(results)

# Select features with significant p-values
significant_features = results[results['P-Value'] < 0.05]


          Feature  Chi2-Value   P-Value
0             age    9.999206  0.018573
1       education    2.335436  0.505766
2      employment    8.880465  0.030923
3       ethnicity    1.409831  0.703232
4          gender    0.989650  0.803756
5          income    3.950708  0.266836
6        language    0.943562  0.814905
7  marital_status    5.079575  0.166062
8            race    2.597910  0.457856


In [12]:
results

Unnamed: 0,Feature,Chi2-Value,P-Value
0,age,9.999206,0.018573
1,education,2.335436,0.505766
2,employment,8.880465,0.030923
3,ethnicity,1.409831,0.703232
4,gender,0.98965,0.803756
5,income,3.950708,0.266836
6,language,0.943562,0.814905
7,marital_status,5.079575,0.166062
8,race,2.59791,0.457856


In [13]:
significant_features

Unnamed: 0,Feature,Chi2-Value,P-Value
0,age,9.999206,0.018573
2,employment,8.880465,0.030923


In [14]:
# Use the significant features for clustering
X_significant = X[significant_features['Feature']]

# Apply clustering algorithm
kmeans = KMeans(n_clusters=3, n_init=10)
clusters = kmeans.fit_predict(X_significant)

In [15]:
# Create a contingency table
contingency_table = pd.crosstab(clusters, X['age'])

# Perform Chi-Square test
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)

print(f"Chi2 Stat: {chi2_stat}, P-Value: {p_val}")


Chi2 Stat: 424.20227920227916, P-Value: 2.8150726747638574e-83


## Method 2: Recursive Feature Elimination

### Pipeline

In [16]:
categorical_features = X.select_dtypes(include=['object']).columns

In [17]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'
)

In [18]:
X_encoded = preprocessor.fit_transform(X)

In [19]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [20]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator=model, n_features_to_select=3)  # Adjust as needed

# Fit RFE
rfe.fit(X_encoded, y_encoded)

# Get selected features
feature_names = preprocessor.get_feature_names_out()
selected_features = feature_names[rfe.support_]
print("Selected features:", selected_features)

Selected features: ['remainder__education' 'remainder__employment' 'remainder__race']


In [21]:
# Encode categorical variables
le = LabelEncoder()
df_encoded = melted_data.apply(le.fit_transform)

# Separate features and target
X = df_encoded.drop(columns=['tone'])
y = df_encoded['tone']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [22]:
# Import necessary modules
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import pandas as pd

# Initialize the RandomForestClassifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Initialize RFE with the model and set the number of features to select
rfe = RFE(estimator=model, n_features_to_select=3)

# Fit RFE
rfe.fit(X_train, y_train)

# Get ranking of features
ranking = rfe.ranking_
feature_names = X_train.columns

# Print feature ranking
print("Feature ranking:")
for i in range(len(feature_names)):
    print(f"{feature_names[i]}: {ranking[i]}")

# Select features with rank 1 (most significant)
selected_features = feature_names[rfe.support_]
print("Selected features:", selected_features)

Feature ranking:
age: 3
education: 1
employment: 1
ethnicity: 7
gender: 6
income: 2
language: 5
marital_status: 4
race: 1
Selected features: Index(['education', 'employment', 'race'], dtype='object')


## Forwards Selection

In [23]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression

# Initialize the model
model = LogisticRegression()

# Initialize forward selection
forward_selector = SequentialFeatureSelector(model, n_features_to_select=3, direction='forward')

# Fit the selector
forward_selector.fit(X_train, y_train)

# Get selected features
selected_features = X_train.columns[forward_selector.get_support()]
print("Selected features:", selected_features)


Selected features: Index(['employment', 'marital_status', 'race'], dtype='object')
