In [1]:
import importlib

def check_and_install(module_name, package_name=None):
    """Import ``module_name``; if the import fails, install it with pip.

    Args:
        module_name: Name passed to ``importlib.import_module``.
        package_name: Name handed to ``pip install``. Defaults to
            ``module_name``; supply it when the distribution name differs
            from the import name.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status.
    """
    try:
        importlib.import_module(module_name)
        print(f"{module_name} is already installed.")
    except ImportError:
        print(f"{module_name} is not installed. Installing...")
        import subprocess
        import sys

        # Run pip through the interpreter executing this code so the package
        # lands in the same environment; a bare "pip" found on PATH may
        # belong to a different Python installation.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", package_name or module_name]
        )
        # Make the import system notice the freshly installed package.
        importlib.invalidate_caches()

# Third-party packages this notebook depends on.
required_modules = ["pandas", "numpy", "sqlalchemy", "geoalchemy2", "shapely"]

# Verify each dependency up front, installing any that are missing.
for module in required_modules:
    check_and_install(module)

pandas is already installed.
numpy is already installed.
sqlalchemy is already installed.
geoalchemy2 is already installed.
shapely is already installed.


In [2]:
import logging
from urllib.parse import quote

import numpy as np
import pandas as pd
from geoalchemy2 import Geometry
from shapely.wkb import loads
from sqlalchemy import Column, String, Text, Integer, select, Date, create_engine, inspect, text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker


In [3]:
user_name = 'l01-4'
# The password is read from a local text file so it does not appear in the
# notebook itself (alternatively, assign your password to passw directly).
with open("l01-4_password.txt") as f:
    # Drop the trailing line break that text editors usually append; otherwise
    # it would become part of the password.
    passw = f.read().rstrip("\r\n")

# Calgary Census Data
## Source: City of Calgary's Open Data Portal
Changelog for Data Export to SQL Table

*From Civic Census by Dwelling and Community*
- Include the columns `CENSUS_YEAR`, `COMM_CODE`, `DWELLING_CNT`, `RESIDENT_CNT`, `OCPD_DWELLING_CNT`, `VACANT_DWELLING_CNT`, `OCPD_OWNERSHIP_CNT`, `RENOVATION_DWELLING_CNT`, `UNDER_CONST_DWELLING_CNT`, `INACTIVE_CNT` and `OTHER_PURPOSE_CNT`.
- Check every `_CNT` column: remove the thousands separator (`,`) and convert the values to INTEGER.

    
Link: https://data.calgary.ca/Demographics/Civic-Census-by-Community/s7f7-3gjj/data

Link: https://data.calgary.ca/Demographics/Civic-Census-by-Community-Age-and-Gender/vsk6-ghca/about_data


In [45]:
# Load the census export and normalise every column label to upper case.
df_census = pd.read_csv("Civic_Census_by_Community_20240229.csv").rename(columns=str.upper)
df_census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5265 entries, 0 to 5264
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CENSUS_YEAR               5265 non-null   int64  
 1   COMM_CODE                 5265 non-null   object 
 2   DWELLING_CNT              5265 non-null   object 
 3   RESIDENT_CNT              5265 non-null   object 
 4   OCPD_DWELLING_CNT         5265 non-null   object 
 5   VACANT_DWELLING_CNT       5265 non-null   object 
 6   OCPD_OWNERSHIP_CNT        5265 non-null   object 
 7   RENOVATION_DWELLING_CNT   5265 non-null   int64  
 8   UNDER_CONST_DWELLING_CNT  5265 non-null   object 
 9   INACTIVE_CNT              5042 non-null   float64
 10  OTHER_PURPOSE_CNT         4598 non-null   float64
dtypes: float64(2), int64(2), object(7)
memory usage: 452.6+ KB


In [6]:
# Display the loaded census DataFrame as the cell output.
df_census

Unnamed: 0,CENSUS_YEAR,COMM_CODE,DWELLING_CNT,RESIDENT_CNT,OCPD_DWELLING_CNT,VACANT_DWELLING_CNT,OCPD_OWNERSHIP_CNT,RENOVATION_DWELLING_CNT,UNDER_CONST_DWELLING_CNT,INACTIVE_CNT,OTHER_PURPOSE_CNT
0,2000,ST3,9,20,8,1,5,0,0,0.0,
1,2010,CIT,3479,10219,3426,41,3107,8,1,2.0,1.0
2,2013,COL,982,2243,941,32,704,6,2,1.0,0.0
3,2017,SVO,2512,6906,2327,59,2016,1,109,11.0,5.0
4,1998,HAY,2891,6125,2748,46,2724,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...
5260,2017,SKR,3834,10043,3330,480,2467,0,16,3.0,5.0
5261,2001,LPK,999,1525,782,59,258,0,150,0.0,8.0
5262,2005,COR,1475,6037,1461,13,1422,1,0,0.0,0.0
5263,2019,CAN,3213,7624,3107,89,2379,13,1,1.0,2.0


In [5]:
# Build the database connection string. The password is percent-encoded so
# characters such as '@', ':' or '/' cannot be misread as URL delimiters.
database_url = (
    f"mysql+mysqlconnector://{user_name}:{quote(passw, safe='')}"
    f"@datasciencedb.ucalgary.ca/{user_name}"
)
# Set the global logging level to WARNING
logging.getLogger().setLevel(logging.WARNING)
# Also raise SQLAlchemy's own logger to WARNING so INFO-level messages are suppressed
logging.getLogger('sqlalchemy').setLevel(logging.WARNING)
# Create the SQLAlchemy engine
engine = create_engine(database_url, echo=False)  # Set echo to True for debugging

# Test the connection. The statement is wrapped in text() because
# SQLAlchemy 2.x no longer accepts a plain string in Connection.execute().
with engine.connect() as connection:
    result = connection.execute(text("SELECT 1"))
    print(result.scalar())



1


In [53]:
# Declarative base class that the ORM model below inherits from
Base = declarative_base()

# ORM model for one row of the census table
class CensusData(Base):
    """ORM mapping for the 'census' table.

    Each row is identified by the composite primary key
    (CENSUS_YEAR, COMM_CODE); the remaining columns are integer counts.
    """
    __tablename__ = 'census'
    # Assuming CENSUS_YEAR and COMM_CODE together form a composite primary key
    CENSUS_YEAR = Column(Integer, primary_key=True)
    COMM_CODE = Column(String(255), primary_key=True)
    # Integer count columns; attribute names match the upper-cased DataFrame columns
    DWELLING_CNT = Column(Integer)
    RESIDENT_CNT = Column(Integer)
    OCPD_DWELLING_CNT = Column(Integer)
    VACANT_DWELLING_CNT = Column(Integer)
    OCPD_OWNERSHIP_CNT = Column(Integer)
    RENOVATION_DWELLING_CNT = Column(Integer)
    UNDER_CONST_DWELLING_CNT = Column(Integer)
    INACTIVE_CNT = Column(Integer)
    OTHER_PURPOSE_CNT = Column(Integer)

# Emit CREATE TABLE for every model registered on Base, using the engine's database
Base.metadata.create_all(engine)


In [46]:
# Flag each column that contains at least one missing value.
nan_check = df_census.isna().any()

# Report only the flagged columns, followed by the frame's dtype summary.
print("Columns with NaN values:", nan_check[nan_check], sep="\n")
df_census.info()

Columns with NaN values:
INACTIVE_CNT         True
OTHER_PURPOSE_CNT    True
dtype: bool
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5265 entries, 0 to 5264
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CENSUS_YEAR               5265 non-null   int64  
 1   COMM_CODE                 5265 non-null   object 
 2   DWELLING_CNT              5265 non-null   object 
 3   RESIDENT_CNT              5265 non-null   object 
 4   OCPD_DWELLING_CNT         5265 non-null   object 
 5   VACANT_DWELLING_CNT       5265 non-null   object 
 6   OCPD_OWNERSHIP_CNT        5265 non-null   object 
 7   RENOVATION_DWELLING_CNT   5265 non-null   int64  
 8   UNDER_CONST_DWELLING_CNT  5265 non-null   object 
 9   INACTIVE_CNT              5042 non-null   float64
 10  OTHER_PURPOSE_CNT         4598 non-null   float64
dtypes: float64(2), int64(2), object(7)
memory usage: 452.6+ KB


In [50]:
def convert_to_int(column):
    """Return ``column`` as an integer Series, mapping unusable values to 0.

    Non-numeric columns are parsed as text: thousands separators (",") are
    removed, and anything that still cannot be parsed becomes 0. Missing
    values (NaN) in numeric columns are replaced by 0 as well, so the final
    integer cast cannot fail on them.

    Args:
        column: pandas Series of counts, either numeric or strings such as
            "1,234".

    Returns:
        pandas Series of integer dtype with the same index as ``column``.
    """
    if not pd.api.types.is_numeric_dtype(column):
        # Cast to str first so the .str accessor works on mixed object columns,
        # then strip the separators and let unparsable entries become NaN.
        cleaned = column.astype(str).str.replace(',', '', regex=False)
        column = pd.to_numeric(cleaned, errors='coerce')
    # NaN cannot be represented in an integer column; fill before casting.
    return column.fillna(0).astype(int)

# Every *_CNT column of df_census is run through convert_to_int.
count_columns = [
    'DWELLING_CNT',
    'RESIDENT_CNT',
    'OCPD_DWELLING_CNT',
    'VACANT_DWELLING_CNT',
    'OCPD_OWNERSHIP_CNT',
    'RENOVATION_DWELLING_CNT',
    'UNDER_CONST_DWELLING_CNT',
    'INACTIVE_CNT',
    'OTHER_PURPOSE_CNT',
]

for col in count_columns:
    df_census[col] = convert_to_int(df_census[col])

# Swap any remaining NaN for None, then show the resulting dtypes.
df_census.replace({np.nan: None}, inplace=True)
df_census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5265 entries, 0 to 5264
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   CENSUS_YEAR               5265 non-null   int64 
 1   COMM_CODE                 5265 non-null   object
 2   DWELLING_CNT              5265 non-null   int32 
 3   RESIDENT_CNT              5265 non-null   int32 
 4   OCPD_DWELLING_CNT         5265 non-null   int32 
 5   VACANT_DWELLING_CNT       5265 non-null   int32 
 6   OCPD_OWNERSHIP_CNT        5265 non-null   int32 
 7   RENOVATION_DWELLING_CNT   5265 non-null   int32 
 8   UNDER_CONST_DWELLING_CNT  5265 non-null   int32 
 9   INACTIVE_CNT              5265 non-null   int32 
 10  OTHER_PURPOSE_CNT         5265 non-null   int32 
dtypes: int32(9), int64(1), object(1)
memory usage: 267.5+ KB


In [51]:
# Session factory bound to the engine created earlier
Session = sessionmaker(bind=engine)
# Open the ORM session through which the census rows are added and committed
session = Session()

In [54]:
# Stage one CensusData object per DataFrame row, then persist them all in a
# single transaction. (For a quick trial run, iterate over df_census.head(5).)
total_rows = len(df_census)

# `position` counts rows as they are processed, so progress reporting does not
# depend on the DataFrame's index labels being 0..n-1.
for position, (index, row) in enumerate(df_census.iterrows(), start=1):
    try:
        census_statistic = CensusData(
            CENSUS_YEAR=row['CENSUS_YEAR'],
            COMM_CODE=row['COMM_CODE'],
            DWELLING_CNT=row['DWELLING_CNT'],
            RESIDENT_CNT=row['RESIDENT_CNT'],
            OCPD_DWELLING_CNT=row['OCPD_DWELLING_CNT'],
            VACANT_DWELLING_CNT=row['VACANT_DWELLING_CNT'],
            OCPD_OWNERSHIP_CNT=row['OCPD_OWNERSHIP_CNT'],
            RENOVATION_DWELLING_CNT=row['RENOVATION_DWELLING_CNT'],
            UNDER_CONST_DWELLING_CNT=row['UNDER_CONST_DWELLING_CNT'],
            INACTIVE_CNT=row['INACTIVE_CNT'],
            OTHER_PURPOSE_CNT=row['OTHER_PURPOSE_CNT']
        )
        session.add(census_statistic)
    except Exception as e:
        # Best effort: report the offending row and continue with the next one.
        print(f"Error in row {index}: {e}")
        print(row)  # Print the entire row for reference

    if position % 1000 == 0:
        # Print progress after every 1000 processed rows
        progress = position / total_rows * 100
        print(f"Progress: {progress:.2f}%")

try:
    session.commit()  # Commit the transaction
except Exception:
    # A failed flush/commit (e.g. a duplicate primary key) leaves the session
    # unusable until it is rolled back; reset it, then surface the error.
    session.rollback()
    raise


Progress: 37.99%
Progress: 75.97%


In [55]:
# Close the session now that the census rows have been committed
session.close()

In [26]:
#session.rollback()