In [17]:
import pandas as pd
import sqlite3
import plotly.express as px
import warnings 

# Ignore all warnings
warnings.filterwarnings('ignore')

# Step 1: Read the CSV File
csv_file = r'C:\Users\Wolfrank\Desktop\Data-SQL\ManhattanRE\Data\ManhattanData.csv'
df = pd.read_csv(csv_file).round(2)

# Print or Display DataFrame
display(df)

# SQL QUERIES & PLOTLY EXPRESS VISUALIZATIONS

# Connect to the SQLite database
conn = sqlite3.connect('manhattan_sales.db')

# Create the SQL table
df.to_sql('sales_data', conn, if_exists='replace', index=False)


Unnamed: 0,RECORD INDEX,NEIGHBORHOOD,BLDGCAT,BLDGDESCRIPTION,TAXCLP,BLOCK,LOT,BLDGCP,ADDRESS,ZIP_CODE,RESIDENTIAL_ UNITS,COMMERCIAL_UNITS,UNITS,LANDSFT,GROSSSF,BUILT,TAXCLSALE,BLDGCTOS,SALE_PRICE,SALE_DATE
0,1,ALPHABET CITY,2,TWO FAMILY DWELLINGS,1,372,36,S2,19 AVENUE D,10009,2.0,1.0,3.0,826,2481,1900.0,1,S2,$1.00,1/20/2023
1,2,ALPHABET CITY,7,RENTALS - WALKUP APARTMENTS,2,390,50,C1,"209 EAST 7TH STREET, 3E",10009,29.0,0.0,29.0,3900,18099,1900.0,2,C1,"$154,196.00",9/15/2023
2,3,ALPHABET CITY,7,RENTALS - WALKUP APARTMENTS,2,390,50,C1,"209 EAST 7TH STREET, 4E",10009,29.0,0.0,29.0,3900,18099,1900.0,2,C1,"$154,196.00",9/15/2023
3,4,ALPHABET CITY,7,RENTALS - WALKUP APARTMENTS,2A,390,60,C2,191 EAST 7TH STREET,10009,5.0,0.0,5.0,1080,3328,1910.0,2,C2,"$2,725,000.00",8/10/2023
4,5,ALPHABET CITY,7,RENTALS - WALKUP APARTMENTS,2B,396,4,C4,221 AVENUE B,10009,8.0,1.0,9.0,1936,5538,1904.0,2,C4,$10.00,3/31/2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16937,16938,WASHINGTON HEIGHTS UPPER,22,STORE BUILDINGS,4,2175,63,K1,180 DYCKMAN STREET,10040,0.0,5.0,5.0,5000,5000,1927.0,4,K1,$0.00,10/31/2023
16938,16939,WASHINGTON HEIGHTS UPPER,22,STORE BUILDINGS,4,2175,66,K1,172 DYCKMAN STREET,10040,0.0,6.0,6.0,10000,9500,1927.0,4,K1,$0.00,6/20/2023
16939,16940,WASHINGTON HEIGHTS UPPER,22,STORE BUILDINGS,4,2175,66,K1,172 DYCKMAN STREET,10040,0.0,6.0,6.0,10000,9500,1927.0,4,K1,$0.00,9/22/2023
16940,16941,WASHINGTON HEIGHTS UPPER,29,COMMERCIAL GARAGES,4,2172,1,G6,4560 BROADWAY,10040,0.0,0.0,0.0,15450,0,,4,G6,"$15,000,000.00",11/16/2022


16942

In [18]:


# SQL Queries 

# Query 1: Number of properties sold per neighborhood
query1 = """
SELECT NEIGHBORHOOD, COUNT(*) AS num_sales
FROM sales_data
GROUP BY NEIGHBORHOOD
ORDER BY num_sales DESC
"""
df_sales_by_neighborhood = pd.read_sql_query(query1, conn)

# Visualization 1: Bar chart for Number of properties sold per neighborhood
fig1 = px.bar(df_sales_by_neighborhood, x='NEIGHBORHOOD', y='num_sales',
              title='Number of Properties Sold per Neighborhood',
              labels={'num_sales': 'Number of Sales', 'NEIGHBORHOOD': 'Neighborhood'})

# Show the plot
fig1.show()


In [19]:

# Query 2
query2 = """
SELECT BLDGCAT, SALE_PRICE
FROM sales_data
WHERE SALE_PRICE IS NOT NULL;
"""

df2 = pd.read_sql_query(query2, conn)

# Visualization 2: Violin plot for Distribution of Sale Prices by Building Category
fig2 = px.violin(df2, x="BLDGCAT", y="SALE_PRICE", title="Distribution of Sale Prices by Building Category")

# Show the plot
fig2.show()


In [20]:
query6 = """
SELECT SALE_DATE, SALE_PRICE
FROM sales_data
WHERE SALE_PRICE IS NOT NULL;
"""

df6 = pd.read_sql_query(query6, conn)

# Visualization 6: Scatter plot for sale price distribution over time
fig6 = px.scatter(df6, x='SALE_DATE', y='SALE_PRICE', title='Sale Price Distribution Over Time')
fig6.show()

In [21]:

# Query 4: Sale price distribution by neighborhood
query4 = """
SELECT NEIGHBORHOOD, SALE_PRICE
FROM sales_data
WHERE SALE_PRICE IS NOT NULL;
"""
df4 = pd.read_sql_query(query4, conn)

# Visualization 4: Box plot for Sale Price Distribution by Neighborhood
fig4 = px.box(df4, x="NEIGHBORHOOD", y="SALE_PRICE", title="Sale Price Distribution by Neighborhood")

# Show the plot
fig4.show()

In [22]:
# Query 5: Number of properties sold by tax class
query5 = """
SELECT TAXCLP, COUNT(*) AS num_sales
FROM sales_data
GROUP BY TAXCLP;
"""
df5 = pd.read_sql_query(query5, conn)

# Visualization 5: Pie chart for Number of Properties Sold by Tax Class
fig5 = px.pie(df5, names='TAXCLP', values='num_sales', title='Number of Properties Sold by Tax Class')

# Show the plot
fig5.show()

In [23]:

# Connect to the SQLite database
conn = sqlite3.connect('manhattan_sales.db')

# Create the SQL table
df.to_sql('sales_data', conn, if_exists='replace', index=False)

# Query 5: Average GROSSSF by NEIGHBORHOOD
query5 = """
SELECT NEIGHBORHOOD, AVG(GROSSSF) AS avg_gross_sf
FROM sales_data
GROUP BY NEIGHBORHOOD;
"""
df5 = pd.read_sql_query(query5, conn)

# Display the average GROSSSF DataFrame
print("Average GROSSSF by NEIGHBORHOOD:")
display(df5)

# Visualization 5: Heatmap for Average GROSSSF by NEIGHBORHOOD
fig5 = px.imshow(df5.pivot_table(index="NEIGHBORHOOD", values="avg_gross_sf"),
                  labels=dict(index="Neighborhood", value="Average GROSSSF"),
                  title="Average GROSSSF by Neighborhood")

# Show the plot
fig5.show()

# Close the database connection
conn.close()


Average GROSSSF by NEIGHBORHOOD:


Unnamed: 0,NEIGHBORHOOD,avg_gross_sf
0,ALPHABET CITY,7.176471
1,CHELSEA,34.634921
2,CHINATOWN,8.285714
3,CIVIC CENTER,82.5
4,CLINTON,31.666667
5,EAST VILLAGE,8.87037
6,FASHION,116.9
7,FINANCIAL,217.066667
8,FLATIRON,77.842105
9,GRAMERCY,42.789474
