In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Load the insurance dataset (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv(filepath_or_buffer="data/insurance.csv")
df.head(n=5)

In [None]:
# Structural overview of the loaded frame: size, dtypes, null counts and
# summary statistics of the numeric columns.
print("Dataset Shape:", df.shape)
print("\nColumn Info:")
# DataFrame.info() writes its report to stdout and returns None, so it must
# not be wrapped in print() -- that would append a stray "None" line.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nBasic Statistics:")
print(df.describe())

In [None]:
# Split the column names by dtype; both lists are reused by later cells.
num_cols = list(df.select_dtypes(include=['number']).columns)
cat_cols = list(df.select_dtypes(include=['object']).columns)

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

# For every categorical column, list its distinct levels and their frequency.
for col, values in df[cat_cols].items():
    levels = values.unique()
    frequencies = values.value_counts()
    print(f"\n{col}: {levels}")
    print(f"Value counts:\n{frequencies}")

In [None]:
# Three views of the target: raw histogram, box plot, and histogram of the
# natural log of charges.
charges = df['charges']
fig, (ax_hist, ax_box, ax_log) = plt.subplots(1, 3, figsize=(15, 5))

ax_hist.hist(charges, bins=30, alpha=0.7, color='skyblue')
ax_hist.set_title('Distribution of Charges')
ax_hist.set_xlabel('Charges ($)')
ax_hist.set_ylabel('Frequency')

sns.boxplot(y=charges, color='lightcoral', ax=ax_box)
ax_box.set_title('Charges Box Plot')

ax_log.hist(np.log(charges), bins=30, alpha=0.7, color='lightgreen')
ax_log.set_title('Log Distribution of Charges')
ax_log.set_xlabel('Log(Charges)')

plt.tight_layout()
plt.show()

# Headline statistics of the target, printed as dollar amounts.
print("Charges Statistics:")
charge_stats = {
    'Mean': charges.mean(),
    'Median': charges.median(),
    'Std': charges.std(),
    'Min': charges.min(),
    'Max': charges.max(),
}
for stat_name, stat_value in charge_stats.items():
    print(f"{stat_name}: ${stat_value:.2f}")

In [None]:
# One box plot of charges per grouping column, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (grouping column, panel title) in row-major panel order.
box_panels = [
    ('sex', 'Charges by Gender'),
    ('smoker', 'Charges by Smoking Status'),
    ('region', 'Charges by Region'),
    ('children', 'Charges by Number of Children'),
]

for ax, (group_col, panel_title) in zip(axes.flat, box_panels):
    sns.boxplot(data=df, x=group_col, y='charges', ax=ax)
    ax.set_title(panel_title)
    if group_col == 'region':
        # Only the region panel gets tilted x tick labels.
        ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
correlation_matrix = df[num_cols].corr()
corr_fig, corr_ax = plt.subplots(figsize=(5, 4))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.3f', ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
plt.show()

# Pairwise scatter plots, every point coloured by smoking status.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (x column, y column, panel title) for each panel, left to right.
scatter_panels = [
    ('age', 'charges', 'Age vs Charges (by Smoking Status)'),
    ('bmi', 'charges', 'BMI vs Charges (by Smoking Status)'),
    ('age', 'bmi', 'Age vs BMI (by Smoking Status)'),
]
for ax, (x_col, y_col, panel_title) in zip(axes, scatter_panels):
    sns.scatterplot(data=df, x=x_col, y=y_col, hue='smoker', ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

In [None]:
# Bucket ages into four labelled bands and store them as a new column.
# pd.cut intervals are right-closed by default, so the edges below give
# (17, 30], (30, 40], (40, 50] and (50, 65].
age_edges = [17, 30, 40, 50, 65]
age_labels = ['18-30', '31-40', '41-50', '51-64']
df['age_group'] = pd.cut(df['age'], bins=age_edges, labels=age_labels)

fig, (ax_groups, ax_trend, ax_means) = plt.subplots(1, 3, figsize=(18, 6))

# Panel 1: charges per age band, split by smoking status.
sns.boxplot(data=df, x='age_group', y='charges', hue='smoker',
            palette='Set2', ax=ax_groups)
ax_groups.set_title('Insurance Charges by Age Group\n(Smoking Status Comparison)',
                    fontsize=12, pad=20)
ax_groups.set_xlabel('Age Group')
ax_groups.set_ylabel('Charges ($)')
ax_groups.legend(title='Smoker')

# Panel 2: all observations with a single fitted regression line.
sns.regplot(data=df, x='age', y='charges',
            scatter_kws={'alpha':0.3, 's':20},
            line_kws={'color':'red'}, ax=ax_trend)
ax_trend.set_title('Age vs Charges\n(Overall Trend with Regression Line)',
                   fontsize=12, pad=20)
ax_trend.set_xlabel('Age')
ax_trend.set_ylabel('Charges ($)')
ax_trend.grid(True, alpha=0.3)

# Panel 3: mean charge at each age, one line per smoking status.
# smoker value -> (legend label, line colour)
smoker_styles = {
    'no': ('Non-smoker', 'lightblue'),
    'yes': ('Smoker', 'salmon'),
}
for status, (legend_label, line_color) in smoker_styles.items():
    mean_by_age = df.loc[df['smoker'] == status].groupby('age')['charges'].mean()
    ax_means.plot(mean_by_age.index, mean_by_age.values, marker='o',
                  linewidth=2, label=legend_label, color=line_color,
                  alpha=0.8, markersize=4)

ax_means.set_title('Average Charges by Age\n(Smoker vs Non-smoker)',
                   fontsize=12, pad=20)
ax_means.set_xlabel('Age')
ax_means.set_ylabel('Average Charges ($)')
ax_means.legend()
ax_means.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
def bmi_category(bmi):
    """Map a BMI value to a weight-status label.

    Thresholds (upper bounds are exclusive):
        bmi < 18.5 -> 'Underweight'
        bmi < 25   -> 'Normal'
        bmi < 30   -> 'Overweight'
        otherwise  -> 'Obese'

    Args:
        bmi: Numeric BMI value, or a missing value (None / NaN).

    Returns:
        The label string, or None when ``bmi`` is missing. Without this
        guard a NaN fails every ``<`` comparison and would be silently
        labelled 'Obese', and None would raise a TypeError.
    """
    # NaN is the only value that does not compare equal to itself.
    if bmi is None or bmi != bmi:
        return None
    if bmi < 18.5:
        return 'Underweight'
    elif bmi < 25:
        return 'Normal'
    elif bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

# Label every row with its BMI category (stored as a new column).
df['bmi_category'] = df['bmi'].map(bmi_category)

fig, (ax_counts, ax_charges) = plt.subplots(1, 2, figsize=(12, 5))

# Left: how many people fall in each category, split by smoking status.
sns.countplot(data=df, x='bmi_category', hue='smoker', ax=ax_counts)
ax_counts.set_title('Distribution of BMI Categories by Smoking Status')
ax_counts.tick_params(axis='x', rotation=45)

# Right: charges per category, split by smoking status.
sns.boxplot(data=df, x='bmi_category', y='charges', hue='smoker', ax=ax_charges)
ax_charges.set_title('Charges by BMI Category and Smoking Status')
ax_charges.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Mean / median / std / count of charges within each level of a grouping
# column. The children table is transposed so that the child counts become
# the columns.
summary_funcs = ['mean', 'median', 'std', 'count']
charges_by = {
    group_col: df.groupby(group_col)['charges'].agg(summary_funcs)
    for group_col in ('smoker', 'sex', 'region', 'children')
}

smoking_stats = charges_by['smoker']
gender_stats = charges_by['sex']
region_stats = charges_by['region']
children_stats = charges_by['children'].T

# (heading, table) pairs, printed in this order.
reports = [
    ("=== SMOKING STATUS ANALYSIS ===", smoking_stats),
    ("\n=== GENDER ANALYSIS ===", gender_stats),
    ("\n=== REGION ANALYSIS ===", region_stats),
    ("\n=== CHILDREN ANALYSIS ===", children_stats),
]
for heading, table in reports:
    print(heading)
    print(table)

In [None]:
# Box plots of the three continuous columns, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
box_panels = [
    ('charges', 'Charges Outliers'),
    ('age', 'Age Distribution'),
    ('bmi', 'BMI Outliers'),
]
for ax, (column, panel_title) in zip(axes, box_panels):
    sns.boxplot(y=df[column], ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# 1.5 * IQR rule on charges: anything outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] is counted as an outlier.
Q1, Q3 = df['charges'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

below_fence = df['charges'] < lower_bound
above_fence = df['charges'] > upper_bound
outliers = df[below_fence | above_fence]

outlier_share = len(outliers)/len(df)*100
print(f"Number of outliers in charges: {len(outliers)} ({outlier_share:.1f}%)")

In [None]:
# Overall summary dashboard: nine panels on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
(
    (ax_dist, ax_smoke_box, ax_age),
    (ax_bmi, ax_sex_pie, ax_region),
    (ax_children, ax_smoker_pie, ax_corr),
) = axes

# Top row: target distribution, smoking effect, age relationship.
sns.histplot(df['charges'], bins=30, ax=ax_dist)
ax_dist.set_title('Charges Distribution')

sns.boxplot(data=df, x='smoker', y='charges', ax=ax_smoke_box)
ax_smoke_box.set_title('Smoking vs Charges')

sns.scatterplot(data=df, x='age', y='charges', hue='smoker', ax=ax_age)
ax_age.set_title('Age vs Charges')

# Middle row: BMI relationship, gender share, region counts.
sns.scatterplot(data=df, x='bmi', y='charges', hue='smoker', ax=ax_bmi)
ax_bmi.set_title('BMI vs Charges')

gender_counts = df['sex'].value_counts()
gender_counts.plot(kind='pie', ax=ax_sex_pie, autopct='%1.1f%%')
ax_sex_pie.set_title('Gender Distribution')

sns.countplot(data=df, x='region', ax=ax_region)
ax_region.set_title('Region Distribution')
ax_region.tick_params(axis='x', rotation=45)

# Bottom row: children counts, smoker share, numeric correlations.
sns.countplot(data=df, x='children', ax=ax_children)
ax_children.set_title('Children Distribution')

smoker_counts = df['smoker'].value_counts()
smoker_counts.plot(kind='pie', ax=ax_smoker_pie, autopct='%1.1f%%')
ax_smoker_pie.set_title('Smoker Distribution')

sns.heatmap(df[num_cols].corr(), annot=True, fmt='.2f', ax=ax_corr, cbar=False)
ax_corr.set_title('Correlation Matrix')

plt.tight_layout()
plt.show()

In [None]:
# Replace the string levels of the three categorical columns with integer
# codes, overwriting the columns in place on df.
# NOTE(review): earlier plotting cells filter on the original string values
# (e.g. smoker == 'yes'); re-running them after this cell will not match.
category_codes = {
    'sex': {'female': 1, 'male': 2},
    'smoker': {'yes': 1, 'no': 2},
    'region': {'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4},
}
for column, codes in category_codes.items():
    df[column] = df[column].replace(codes)
df.head()

In [None]:
# List the column labels currently on df before choosing model features.
df.columns

In [None]:
# Feature matrix (six columns of df) and regression target.
X = df[[
    'age', 'sex', 'bmi', 'children', 'smoker', 'region',
]]
y = df['charges']

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 1)

# Fit the scaler on the training split only, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train_scaler = scaler.fit_transform(X_train)
X_test_scaler = scaler.transform(X_test)

# Degree-6 polynomial expansion of the scaled features. With 6 inputs this
# produces C(12, 6) = 924 columns (bias term included), so compare train
# and test error to judge how well the fit generalises.
poly = PolynomialFeatures(degree=6)
X_poly_train = poly.fit_transform(X_train_scaler)
X_test_poly = poly.transform(X_test_scaler)
# Do NOT refit `poly` on X_poly_train: that would make the transformer
# expect the already-expanded width, so it could no longer transform raw
# scaled features.

lin = LinearRegression()
lin.fit(X_poly_train, y_train)

In [None]:
# Predictions of the fitted model on the polynomial-expanded test features.
y_pred = lin.predict(X=X_test_poly)

In [None]:
# Mean absolute error on the held-out test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean absolute error on the training split, for comparison with the test
# error.
y_pred_train = lin.predict(X=X_poly_train)
mean_absolute_error(y_true=y_train, y_pred=y_pred_train)

In [None]:
# Refit with a degree-3 expansion of the same scaled features: with 6 inputs
# this produces C(9, 3) = 84 columns (bias term included). The names `poly`,
# `X_poly_train`, `X_test_poly` and `lin` are rebound to this model.
poly = PolynomialFeatures(degree=3)
X_poly_train = poly.fit_transform(X_train_scaler)
X_test_poly = poly.transform(X_test_scaler)
# Do NOT refit `poly` on X_poly_train: that would make the transformer
# expect the already-expanded width, so it could no longer transform raw
# scaled features.
lin = LinearRegression()
lin.fit(X_poly_train, y_train)

In [None]:
# Test-split predictions and mean absolute error for the current model.
y_pred = lin.predict(X=X_test_poly)
mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Training-split predictions and mean absolute error for the current model.
y_pred_train = lin.predict(X=X_poly_train)
mean_absolute_error(y_true=y_train, y_pred=y_pred_train)

In [None]:
# Predicted vs. actual charges on the held-out test split.
# The x axis must be a 1-D vector with one value per prediction; the full
# feature frame X is 2-D and covers every row, not just the test rows.
actual = np.asarray(y_test)  # drop the pandas index so pairing is positional
ax = sns.scatterplot(x=actual, y=y_pred, alpha=0.6)

# Reference diagonal: points on this line are predicted exactly.
axis_limits = [min(actual.min(), y_pred.min()), max(actual.max(), y_pred.max())]
ax.plot(axis_limits, axis_limits, color='red', linestyle='--', label='Perfect prediction')

ax.set_xlabel('Actual charges ($)')
ax.set_ylabel('Predicted charges ($)')
ax.set_title('Predicted vs Actual Charges (test set)')
ax.legend()
plt.show()