In [None]:
import pandas as pd

# Path to the raw training CSV, relative to the notebook's working directory.
file_path = 'data/housing-train-data-6628a4723213d886993351.csv'
# Load the file into a DataFrame using the read_csv defaults.
data = pd.read_csv(file_path)

# Preview the first rows of the raw frame.
data.head()

In [None]:
# Summary statistics of the raw frame (pandas `describe` defaults).
data.describe()

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
data.info()

# Suppression de la colonne Unnamed: 0

In [None]:
# Remove the 'Unnamed: 0' column; `drop` returns a new frame, so the raw
# `data` frame is left untouched.
data_cleaned = data.drop('Unnamed: 0', axis='columns')

# Keep the preview in its own variable and display it as the cell output.
data_cleaned_head = data_cleaned.head()
data_cleaned_head


In [None]:
%matplotlib inline
# Draw one 50-bin histogram per column that DataFrame.hist can plot, on a 20x15 inch figure.
data_cleaned.hist(bins=50, figsize=(20, 15))

# Vérification des valeurs manquantes

In [None]:
# Count the missing entries of every column (`isna` is the same check as `isnull`).
missing_values = data_cleaned.isna().sum(axis=0)

missing_values


# Suppression des valeurs manquantes pour total_bedrooms

In [39]:
# Keep only the rows whose 'total_bedrooms' entry is present.
has_bedroom_count = data_cleaned['total_bedrooms'].notna()
data_cleaned = data_cleaned[has_bedroom_count]

# Vérification des doublons

In [40]:
# Flag every row that repeats an earlier row in full, then count the flags.
is_repeated_row = data_cleaned.duplicated(keep='first')
duplicates = is_repeated_row.sum()
duplicates

0

# Exploration

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Side-by-side boxplots of the four count-type columns on one shared axis.
count_columns = ['total_rooms', 'total_bedrooms', 'population', 'households']

fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=data_cleaned[count_columns], ax=ax)
ax.tick_params(axis='x', labelrotation=45)  # tilt the column names for readability
ax.grid(True)
plt.show()

In [None]:
# Two histograms (with KDE overlay) drawn side by side: income on the left,
# house value on the right. Each panel is described by one tuple so the
# plotting code is written only once.
fig, (income_ax, value_ax) = plt.subplots(1, 2, figsize=(12, 6))

panels = (
    (income_ax, 'median_income', 'blue', 'Distribution de Median Income', 'Median Income'),
    (value_ax, 'median_house_value', 'green', 'Distribution de Median House Value', 'Median House Value'),
)

for ax, column, shade, heading, x_label in panels:
    sns.histplot(data_cleaned[column], bins=30, kde=True, color=shade, ax=ax)
    ax.set_title(heading)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Relationship between income and house value; alpha=0.5 keeps dense
# regions readable where points overlap.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data_cleaned, x='median_income', y='median_house_value', alpha=0.5, ax=ax)
ax.set(
    title='Scatter Plot de Median House Value vs. Median Income',
    xlabel='Median Income',
    ylabel='Median House Value',
)
ax.grid(True)
plt.show()


In [None]:
# House-value distribution for each 'ocean_proximity' category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data_cleaned, x='ocean_proximity', y='median_house_value', ax=ax)
ax.set(
    title='Boxplot of Median House Value by Ocean Proximity',
    xlabel='Ocean Proximity',
    ylabel='Median House Value',
)
ax.tick_params(axis='x', labelrotation=45)  # tilt the category names
plt.show()


In [None]:
# Number of rows per 'ocean_proximity' category.
data_cleaned["ocean_proximity"].value_counts()

# Encodage one-hot de ocean_proximity (une colonne indicatrice par catégorie)

In [47]:
# One-hot encode 'ocean_proximity': the column is replaced by one indicator
# column per category. The guard makes the cell safe to re-run: once the
# column has been consumed, a second get_dummies call would raise a KeyError.
if 'ocean_proximity' in data_cleaned.columns:
    data_cleaned = pd.get_dummies(data_cleaned, columns=['ocean_proximity'])


In [None]:
# Pairwise correlations between all columns, shown as an annotated heatmap.
corr = data_cleaned.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
plt.show()


In [None]:
# Same pairwise correlations as a table; 'pearson' is the pandas default
# method, spelled out here for the reader.
correlation_matrix = data_cleaned.corr(method='pearson')

correlation_matrix

# Localisation

In [None]:
# Plot every row at its coordinates; the low alpha makes dense areas stand out.
data_cleaned.plot.scatter(x="longitude", y="latitude", alpha=0.1)

In [None]:
# Geographic scatter where marker size follows 'population' (scaled down by
# 100) and marker colour follows 'median_house_value' on the "jet" colormap.
marker_sizes = data_cleaned["population"] / 100
value_colormap = plt.get_cmap("jet")

data_cleaned.plot.scatter(
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=marker_sizes,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap=value_colormap,
    colorbar=True,
    sharex=False,
)
plt.legend()

In [None]:
import folium

# Draw a fixed, reproducible 2000-row sample (seeded with random_state=42).
sample_data = data_cleaned.sample(n=2000, random_state=42)

# Centre the map on the mean coordinates of the sample.
map_center = [sample_data['latitude'].mean(), sample_data['longitude'].mean()]
sample_map = folium.Map(location=map_center, zoom_start=6)

# Every sampled row gets the same small blue circle marker.
marker_style = dict(radius=3, color='blue', fill=True, fill_color='blue')
for lat, lon in zip(sample_data['latitude'], sample_data['longitude']):
    folium.CircleMarker(location=[lat, lon], **marker_style).add_to(sample_map)

sample_map


# Localisation par rapport à la valeur des maisons

In [None]:
# Draw a 2000-row sample with a fixed seed (random_state=42) for the map below.
sample_data = data_cleaned.sample(n=2000, random_state=42)

def color(value):
    """Map a house value to a marker colour name.

    Values below 100000 are 'green', below 200000 'blue', below 300000
    'purple'; everything else (including values that compare False against
    every bound) is 'red'.
    """
    # Ordered (exclusive upper bound, colour) pairs; the first match wins.
    bands = ((100000, 'green'), (200000, 'blue'), (300000, 'purple'))
    for upper_bound, shade in bands:
        if value < upper_bound:
            return shade
    return 'red'

def radius(value):
    """Return the marker radius for a house value: 5 below 200000, otherwise 10."""
    return 5 if value < 200000 else 10

# Centre the map on the mean coordinates of the sample.
map_center = [sample_data['latitude'].mean(), sample_data['longitude'].mean()]
sample_map = folium.Map(location=map_center, zoom_start=6)

# One circle per sampled row; colour and size both derive from the row's
# 'median_house_value' through the color() and radius() helpers.
coordinates_and_values = zip(
    sample_data['latitude'],
    sample_data['longitude'],
    sample_data['median_house_value'],
)
for lat, lon, house_value in coordinates_and_values:
    marker_color = color(house_value)  # used for both outline and fill
    folium.CircleMarker(
        location=[lat, lon],
        radius=radius(house_value),
        color=marker_color,
        fill=True,
        fill_color=marker_color,
    ).add_to(sample_map)

sample_map


# Sous-échantillonnage des valeurs plafonnées (housing_median_age > 51, median_house_value > 500 000)

In [54]:
# Rows above the two upper thresholds (age > 51, value > 500000) are reduced
# to at most 250 randomly drawn rows per threshold; every row at or below
# both thresholds is kept as is.
capped_sample_size = 250

rows_age_over_51 = data_cleaned[data_cleaned['housing_median_age'] > 51]
rows_value_over_500k = data_cleaned[data_cleaned['median_house_value'] > 500000]

# min(...) guards against a ValueError when fewer rows than the requested
# sample size exceed a threshold.
sample_age_over_51 = rows_age_over_51.sample(
    n=min(capped_sample_size, len(rows_age_over_51)), random_state=200
)
sample_value_over_500k = rows_value_over_500k.sample(
    n=min(capped_sample_size, len(rows_value_over_500k)), random_state=200
)
filtered_data = data_cleaned[(data_cleaned['median_house_value'] <= 500000) & (data_cleaned['housing_median_age'] <= 51)]

data_cleaned2 = pd.concat([filtered_data, sample_age_over_51, sample_value_over_500k])
# A row exceeding BOTH thresholds can be drawn into both samples, which would
# duplicate it. The index labels come from the original frame, so a repeated
# label identifies such a row; keep its first occurrence only. `.copy()`
# yields an independent frame so that later column assignments are safe.
data_cleaned2 = data_cleaned2[~data_cleaned2.index.duplicated(keep='first')].copy()

# Création de variables dérivées (ratios) et suppression des colonnes brutes

In [55]:
# Derive two ratio features from the raw counts.
rooms = data_cleaned2['total_rooms']
data_cleaned2['rooms_per_household'] = rooms.div(data_cleaned2['households'])
data_cleaned2['bedrooms_per_room'] = data_cleaned2['total_bedrooms'].div(rooms)

# The raw room/bedroom totals are dropped now that the ratios exist.
final_data = data_cleaned2.drop(columns=['total_rooms', 'total_bedrooms'])

# Création des distances par rapport à LA et SF

In [None]:
import numpy as np

# Great-circle distance between two points using the haversine formula.
def haversine(lon1, lat1, lon2, lat2):
    """Return the haversine distance in kilometres between two points.

    Note the argument order: longitude first, then latitude, for each point.
    All four arguments are given in decimal degrees. Because only NumPy
    ufuncs are used, scalars and array-likes (e.g. whole columns) can be
    mixed; the result follows NumPy broadcasting.
    """
    # Convert degrees to radians.
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # Longitude and latitude differences.
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    # Haversine formula.
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Round-off can push `a` marginally outside [0, 1] (e.g. for nearly
    # antipodal points), which would make arcsin(sqrt(a)) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Earth radius in kilometres
    return c * r

# Reference coordinates (longitude, latitude) in decimal degrees.
la_lon, la_lat = -118.2437, 34.0522  # Los Angeles
sf_lon, sf_lat = -122.4194, 37.7749  # San Francisco

# New distance columns. haversine() is built from NumPy ufuncs only, so the
# coordinate columns can be passed whole; this replaces one Python-level call
# per row (DataFrame.apply with axis=1) with a single vectorised computation.
final_data['distance_to_la'] = haversine(la_lon, la_lat, final_data['longitude'], final_data['latitude'])
final_data['distance_to_sf'] = haversine(sf_lon, sf_lat, final_data['longitude'], final_data['latitude'])

# Check the new columns.
print(final_data[['distance_to_la', 'distance_to_sf']].head())

# Export final_data dans un CSV

In [57]:
# Write the engineered frame to CSV; index=False leaves the row index out of the file.
final_data.to_csv('data/final_data.csv', index=False)