# Census Income

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [12]:
# Step 1: Load the dataset
df = pd.read_csv(r'C:\Users\write\census_income.csv')
df

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education_num,Marital_status,Occupation,Relationship,Race,Sex,Capital_gain,Capital_loss,Hours_per_week,Native_country,Income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32556,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32557,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32558,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [15]:
# Step 1: Load the dataset
url = 'C:/Users/write/census_income.csv'
data = pd.read_csv(url)

In [17]:
# Step 2: Preprocess the data
# Drop irrelevant columns
data.drop(['Fnlwgt'], axis=1, inplace=True)

In [20]:
# Convert categorical variables to numerical using LabelEncoder
label_encoder = LabelEncoder()
categorical_columns = ['Workclass', 'Education', 'Marital_status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Native_country', 'Income']
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

In [22]:
# Step 3: Split the data into features and target variable
X = data.drop('Income', axis=1)
y = data['Income']

In [23]:
# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
# Step 5: Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [25]:
# Step 6: Build and train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)


RandomForestClassifier()

In [26]:
# Step 7: Evaluate the model
y_pred = model.predict(X_test)


In [27]:
accuracy = accuracy_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

In [28]:
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_mat)
print("Classification Report:\n", classification_rep)

Accuracy: 0.8511977886977887
Confusion Matrix:
 [[4506  406]
 [ 563 1037]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.92      0.90      4912
           1       0.72      0.65      0.68      1600

    accuracy                           0.85      6512
   macro avg       0.80      0.78      0.79      6512
weighted avg       0.85      0.85      0.85      6512



# Rainfall Weather Forecasting

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

In [46]:
# Step 1: Load the dataset
url = 'https://raw.githubusercontent.com/dsrscientist/dataset3/main/weatherAUS.csv'
data = pd.read_csv(url)


In [47]:
# Step 2: Preprocess the data
# Drop irrelevant columns and handle missing values
data.dropna(subset=['RainTomorrow'], inplace=True)
data.drop(['Date', 'Location'], axis=1, inplace=True)

In [48]:
# Convert categorical variables to numerical using LabelEncoder
label_encoder = LabelEncoder()
categorical_columns = ['WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainTomorrow']
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col].astype(str))

In [49]:
# Step 3: Split the data into features and target variable
X = data.drop(['RainTomorrow', 'Rainfall'], axis=1)
y_class = data['RainTomorrow']
y_reg = data['Rainfall']

In [50]:
# Step 4: Split the data into training and testing sets
X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(
    X, y_class, y_reg, test_size=0.2, random_state=42)

In [51]:
# Step 5: Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

ValueError: could not convert string to float: 'No'

# Insurance Claim Fraud Detection

In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [53]:
# Step 1: Load the dataset
url = 'https://raw.githubusercontent.com/dsrscientist/Data-Science-ML-Capstone-Projects/master/Automobile_insurance_fraud.csv'
data = pd.read_csv(url)

In [54]:
# Step 2: Preprocess the data
# Drop irrelevant columns and handle missing values
data.drop(['policy_number', 'policy_bind_date', '_c39'], axis=1, inplace=True)
data.dropna(inplace=True)


In [55]:
# Convert categorical variables to numerical using LabelEncoder
label_encoder = LabelEncoder()
categorical_columns = ['policy_state', 'policy_csl', 'insured_sex', 'insured_education_level', 
                       'insured_occupation', 'insured_hobbies', 'insured_relationship', 
                       'incident_date', 'incident_type', 'collision_type', 'incident_severity', 
                       'authorities_contacted', 'incident_state', 'incident_city', 'incident_location', 
                       'property_damage', 'police_report_available', 'auto_make', 'auto_model']
for col in categorical_columns:
    data[col] = label_encoder.fit_transform(data[col])

In [56]:
# Step 3: Split the data into features and target variable
X = data.drop('fraud_reported', axis=1)
y = data['fraud_reported']

In [57]:
# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [58]:
# Step 5: Build and train the classification model
classification_model = RandomForestClassifier()
classification_model.fit(X_train, y_train)

RandomForestClassifier()

In [59]:
# Step 6: Evaluate the model
y_pred = classification_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [60]:
print("Model Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.755
Classification Report:
              precision    recall  f1-score   support

           N       0.79      0.90      0.84       145
           Y       0.58      0.38      0.46        55

    accuracy                           0.76       200
   macro avg       0.69      0.64      0.65       200
weighted avg       0.74      0.76      0.74       200

