# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import numpy as np

# Load the training split of the competition data.
df = pd.read_csv("/kaggle/input/data-dash-cr/train (1).csv")

# Quick structural overview: column dtypes / non-null counts, the first rows,
# summary statistics, and per-column missing-value counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.head())
print(df.describe())
print(df.isnull().sum())

# Class balance of the target column.
sns.countplot(x='Condition', data=df)
plt.title('Distribution of Medical Conditions')
plt.show()

# Pairwise scatter plots with KDE diagonals, colored by the target.
sns.pairplot(df, hue='Condition', diag_kind='kde')
plt.suptitle('Pair Plot of Numerical Features')
plt.show()

# Correlation is only defined for numeric data. Restrict the frame explicitly:
# with string columns present, DataFrame.corr() raises in pandas >= 2.0
# instead of silently skipping them as older versions did.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

# One count plot per string-typed column, split by the target.
# NOTE(review): columns such as username/address presumably have very many
# distinct values, which would make these plots slow and unreadable -- consider
# limiting the loop to low-cardinality columns.
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    plt.figure(figsize=(8, 6))
    sns.countplot(x=column, data=df, hue='Condition')
    plt.title(f'Distribution of {column}')
    plt.show()


# Columns excluded from the feature matrix.
non_feature_columns = ['username', 'name', 'sex', 'address', 'mail', 'birthdate']

# The target is dropped together with the excluded columns: leaving
# 'Condition' among the features would leak the label into training and
# produce a model that expects a column the test file cannot provide.
X_train = df.drop(non_feature_columns + ['Condition'], axis=1)
y_train = df['Condition']

# No username preprocessing happens here: the column is excluded above, so
# touching X_train['username'] would raise KeyError.
# NOTE(review): if the training file also carries the row 'id' column, it is
# still used as a feature here -- confirm that this is intended.

model = RandomForestClassifier()

test_data = pd.read_csv("/kaggle/input/data-dash-cr/test (1).csv")

# Predict on exactly the training feature columns, in the same order, so the
# model sees the same inputs it was fitted on. A column missing from the test
# file surfaces here as a KeyError rather than as a mismatch inside predict().
X_test = test_data[X_train.columns]

model.fit(X_train, y_train)

predictions = model.predict(X_test)


# Build the two-column submission frame: the test row ids next to the
# predicted condition for each row.
submission = test_data[['id']].assign(Condition=predictions)

# Persist without the index so the file contains only the two columns.
submission.to_csv('/kaggle/working/submission.csv', index=False)

# Trailing expression: shows a preview when run as a notebook cell.
submission.head()