In [1]:
import pandas as pd

In [4]:
# Load the survey workbook into a DataFrame; the path is relative to the
# directory the notebook is run from.
df = pd.read_excel('datasets/FoodInsecurity_Hispanic_Demographics_Tone_Preferences_Dataset.xlsx')

In [5]:
# Preview the first two rows: demographic columns followed by sample_1..sample_8.
df.head(2)

Unnamed: 0,age,gender,ethnicity,race,education,marital_status,income,employment,language,disability,states,sample_1,sample_2,sample_3,sample_4,sample_5,sample_6,sample_7,sample_8
0,45-54,female,non hispanic,native american,High School,na,"$25,000 - $49,999",Employed Part time,both,i do not have a disability,indiana,Persuasive,Simplier,Empathetic,Persuasive,Original,Original,Persuasive,Original
1,18-24,male,hispanic,white,High School,single,"Less than $25,000",Employed Part time,english,i do not have a disability,illinois,Original,Simplier,Empathetic,Simplier,Simplier,Original,Original,Persuasive


In [6]:
# List the column labels available in the loaded frame.
df.columns

Index(['age', 'gender', 'ethnicity', 'race', 'education', 'marital_status',
       'income', 'employment', 'language', 'disability', 'states', 'sample_1',
       'sample_2', 'sample_3', 'sample_4', 'sample_5', 'sample_6', 'sample_7',
       'sample_8'],
      dtype='object')

In [31]:
# Reshape from wide (one column per sample_N) to long (one row per
# respondent/sample pair), storing the chosen tone in a single 'tone' column.
sample_cols = [f'sample_{i}' for i in range(1, 9)]

# Index.difference returns the remaining labels in sorted order, which is why
# the demographic columns of the melted frame come out alphabetised.
id_vars = df.columns.difference(sample_cols)

# pd.melt returns a new frame, so `df` itself is left untouched.
melted_data = (
    pd.melt(df, id_vars=id_vars, value_vars=sample_cols, value_name='tone')
    .drop(columns='variable')  # discard the sample_N label column added by melt
)

In [32]:
# Preview the long-format frame: demographic columns plus the single 'tone' column.
melted_data.head(2)

Unnamed: 0,age,disability,education,employment,ethnicity,gender,income,language,marital_status,race,states,tone
0,45-54,i do not have a disability,High School,Employed Part time,non hispanic,female,"$25,000 - $49,999",both,na,native american,indiana,Persuasive
1,18-24,i do not have a disability,High School,Employed Part time,hispanic,male,"Less than $25,000",english,single,white,illinois,Original


In [34]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [35]:
# Raw (string-valued) feature matrix and target taken from the melted frame.
# They are bound here so this cell runs on a fresh kernel; a later cell rebinds
# X and y to integer-encoded versions for the logistic-regression analysis.
X = melted_data.drop(columns=['tone'])
y = melted_data['tone']

# Object-dtype columns are the ones handed to the one-hot encoder.
categorical_features = X.select_dtypes(include=['object']).columns

In [36]:
# One-hot encode the categorical columns; every other column is passed
# through to the output unchanged.
cat_step = ('cat', OneHotEncoder(), categorical_features)
preprocessor = ColumnTransformer(transformers=[cat_step], remainder='passthrough')

In [37]:
# Fit the column transformer on X and produce the encoded feature matrix
# that the random-forest RFE step consumes.
X_encoded = preprocessor.fit_transform(X)

In [38]:
from sklearn.preprocessing import LabelEncoder

# Learn the target's label set, then map each label to its integer code.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)

In [39]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination over the one-hot matrix, with a seeded random
# forest as the estimator and ten columns retained.
model = RandomForestClassifier(random_state=42, n_estimators=100)
rfe = RFE(estimator=model, n_features_to_select=10).fit(X_encoded, y_encoded)

# Apply RFE's boolean support mask to the transformer's output column names.
feature_names = preprocessor.get_feature_names_out()
selected_features = feature_names[rfe.support_]
print("Selected features:", selected_features)

Selected features: ['cat__age_25-34' 'cat__education_Associate'
 'cat__employment_Employed Part time' 'cat__gender_female'
 'cat__income_$25,000 - $49,999' 'cat__language_english'
 'cat__marital_status_married' 'cat__race_white' 'cat__states_illinois'
 'cat__states_new jersey']


In [42]:
from sklearn.model_selection import train_test_split


In [43]:
# Integer-encode every column with its own LabelEncoder. Keeping one fitted
# encoder per column means any code can be mapped back to its label with
# encoders[col].inverse_transform(...); a single shared encoder only remembers
# the last column it was fitted on.
encoders = {col: LabelEncoder().fit(melted_data[col]) for col in melted_data.columns}
df_encoded = melted_data.apply(lambda col: encoders[col.name].transform(col))

# `le` stays bound to the target's encoder, so le.classes_ lists the tone labels.
le = encoders['tone']

# Separate features and target
X = df_encoded.drop(columns=['tone'])
y = df_encoded['tone']

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Rank the features with RFE wrapped around a logistic-regression estimator.
# Asking RFE to keep a single feature yields a complete ordering in ranking_
# (1 = retained, larger numbers = eliminated earlier).
model = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model, n_features_to_select=1).fit(X_train, y_train)

ranking = rfe.ranking_
feature_names = X.columns

# Report every feature alongside its rank, in column order.
print("Feature ranking:")
for name, rank in zip(feature_names, ranking):
    print(f"{name}: {rank}")

# support_ flags the features RFE retained, i.e. those with rank 1.
selected_features = feature_names[rfe.support_]
print("Selected features:", selected_features)

Feature ranking:
age: 10
disability: 1
education: 8
employment: 4
ethnicity: 2
gender: 3
income: 5
language: 7
marital_status: 9
race: 11
states: 6
Selected features: Index(['disability'], dtype='object')


In [46]:
# Tabulate each feature's RFE rank, best (rank 1) first.
ranking_df = (
    pd.DataFrame({'Feature': X.columns, 'Ranking': rfe.ranking_})
    .sort_values(by='Ranking')
)
print(ranking_df)

           Feature  Ranking
1       disability        1
4        ethnicity        2
5           gender        3
3       employment        4
6           income        5
10          states        6
7         language        7
2        education        8
8   marital_status        9
0              age       10
9             race       11
