<font size="5"><b>Project Objectives:</b></font>

<font size="3">- With different datasets, create and manipulate databases with SQLite3;</font>

<font size="3">- Answer different kinds of business questions with queries;</font>

<font size="3">- Create DataFrame objects as results from queries for a better visualization, this also make possible to start drawing inferences utilizing Pandas DataFrame functions.</font>

<font size="3"><b>Imports for data and database manipulation</b></font>

In [1]:
import pandas as pd
import sqlite3 as s3

<font size="4">Initial settings</font>

In [2]:
# Removing limitations by setting it to None to display all columns.
pd.set_option('display.max_columns', None)
# Removing limitations by setting it to None to display all rows.
pd.set_option('display.max_rows', None) 

# Function to return all the queries into dataframes
def query_results(cursor):
    # Get the results from the query
    results = cursor.fetchall()

    # Get the column names from the cursor description
    column_names = [description[0] for description in cursor.description]

    # Create the DataFrame from the results and column names
    df_results = pd.DataFrame(results, columns=column_names)

    # Query results
    return df_results

<font size="5"><b>#005 Dataset :  Sales - Products - Inventory Datasets</b></font>
* Creating a database with three tables, with some primary and foreign keys.

In [3]:
df_sales = pd.read_csv('Sales.csv')
df_products = pd.read_csv('Products.csv')
df_inventory = pd.read_csv('Inventory.csv')
pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', None)

conn = s3.connect('Sales-products-inventory.sqlite')
cursor = conn.cursor()

# Salve os dados do Pandas no banco de dados SQLite3
df_sales.to_sql('Sales', conn, if_exists='replace', index=False)
df_products.to_sql('Products', conn, if_exists='replace', index=False)
df_inventory.to_sql('Inventory', conn, if_exists='replace', index=False)

1000

In [4]:
print(df_sales.info())
df_sales.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   SalesId    200000 non-null  int64  
 1   StoreId    200000 non-null  int64  
 2   ProductId  200000 non-null  int64  
 3   Date       200000 non-null  object 
 4   UnitPrice  200000 non-null  float64
 5   Quantity   200000 non-null  int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 9.2+ MB
None


Unnamed: 0,SalesId,StoreId,ProductId,Date,UnitPrice,Quantity
0,82319,22726,590,2019-12-02,0.0525,93
1,15022,21754,390,2017-11-19,5.11,28
2,11624,71053,883,2020-07-13,7.3675,33
3,63101,22914,658,2019-05-12,2.0825,76
4,29702,22623,632,2020-07-20,0.6475,8


In [5]:
print(df_products.info())
df_products.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductId    1000 non-null   float64
 1   ProductName  1000 non-null   object 
 2   Supplier     1000 non-null   object 
 3   ProductCost  1000 non-null   float64
dtypes: float64(2), object(2)
memory usage: 31.4+ KB
None


Unnamed: 0,ProductId,ProductName,Supplier,ProductCost
0,1.0,Chocolate Bar - Smarties,National Stores,1.25


In [6]:
print(df_inventory.info())
df_inventory.head(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ProductId          1000 non-null   int64 
 1   StoreId            1000 non-null   int64 
 2   StoreName          999 non-null    object
 3   Address            1000 non-null   object
 4   neighborhood       1000 non-null   object
 5   QuantityAvailable  1000 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 47.0+ KB
None


Unnamed: 0,ProductId,StoreId,StoreName,Address,neighborhood,QuantityAvailable
0,1,85123,National Stores,9 Springview Point,Bolton Hill,11


<font size="4">Business Question: Finding the <b>first</b> and <b>last</b> sale date - Range of the sales records</font>

In [7]:
cursor.execute('''
SELECT *
FROM Sales
WHERE Date = (SELECT MIN(Date) FROM Sales)
   OR Date = (SELECT MAX(Date) FROM Sales)
LIMIT 2;
''')
query_results(cursor)

Unnamed: 0,SalesId,StoreId,ProductId,Date,UnitPrice,Quantity
0,81057,22310,159,2020-12-30,0.8925,31
1,63931,22310,55,2017-01-01,4.48,34


<font size="2"><b>First Sale Date</b>: 2017-01-01</font>

<font size="2"><b>Last Sale Date</b>: 2020-12-30</font>

In [8]:
cursor.execute('''
SELECT CAST(JULIANDAY(MAX(Date)) - JULIANDAY(MIN(Date)) AS INTEGER) AS days_duration
FROM Sales;
''')

query_results(cursor)

Unnamed: 0,days_duration
0,1459


<font size="2">Number of Days Between the <b>First</b> and <b>Last</b> Sale Dates: <b>1459 days</b></font>

<font size="4">Business Question: Identifying the <b>10 Most Sold Products</b> from 2017 to 2020</font>

In [9]:
cursor.execute('''
SELECT
    strftime('%Y', date) AS year,
    strftime('%m', date) AS month,
    ProductId,
    ROUND(MAX(UnitPrice), 2) AS UnitPrice,
    SUM(Quantity) AS UnitsSold
FROM
    Sales
GROUP BY
    year,
    month,
    ProductId 
ORDER BY UnitsSold DESC
LIMIT 10
''')
query_results(cursor)

Unnamed: 0,year,month,ProductId,UnitPrice,UnitsSold
0,2017,9,221,1.23,882
1,2017,11,869,4.71,873
2,2020,12,219,2.77,861
3,2018,6,899,1.09,817
4,2018,7,381,1.31,809
5,2019,8,848,3.34,795
6,2018,2,961,1.3,781
7,2018,1,575,3.06,777
8,2019,10,875,1.05,770
9,2017,1,639,2.91,763


<font size="4">Business Question: <b>Total</b> Sales Revenue <b>Each Year</b></font>

In [10]:
cursor.execute('''
SELECT 
    strftime('%Y', Date) AS year, 
    ROUND(SUM(UnitPrice * Quantity), 2) AS total_revenue_millions 
FROM 
    Sales 
GROUP BY 
    year 
ORDER BY 
    year;
''')
df_results = query_results(cursor)
df_results['total_revenue_millions'] = df_results['total_revenue_millions'].apply(lambda x: f'{x/1e6:.2f} millions')
df_results

Unnamed: 0,year,total_revenue_millions
0,2017,8.37 millions
1,2018,8.43 millions
2,2019,8.25 millions
3,2020,8.33 millions


<font size="4">Business Question: <b>Top Selling</b> Products</font>

In [11]:
cursor.execute('''
SELECT
    ProductId,
    SUM(Quantity) AS total_units_sold
FROM Sales
GROUP BY ProductId
ORDER BY total_units_sold DESC
LIMIT 10;
''')
query_results(cursor)

Unnamed: 0,ProductId,total_units_sold
0,530,12937
1,392,12711
2,133,12459
3,262,12435
4,991,12363
5,603,12361
6,837,12167
7,345,12068
8,668,12051
9,892,12035


<font size="4">Business Question: Average Unit Price</font>

In [12]:
cursor.execute('''
SELECT AVG(UnitPrice) AS average_unit_price
FROM Sales;
''')
query_results(cursor)

Unnamed: 0,average_unit_price
0,3.313406


<font size="4">Business Question: Store with Highest Revenue</font>

In [13]:
cursor.execute('''
SELECT
    StoreId,
    SUM(UnitPrice * Quantity) AS total_revenue
FROM Sales
GROUP BY StoreId
ORDER BY total_revenue DESC
LIMIT 1;
''')

df_results = query_results(cursor)
df_results['total_revenue'] = df_results['total_revenue'].apply(lambda x: f'{x/1e6:.2f} millions')
df_results

Unnamed: 0,StoreId,total_revenue
0,22748,1.02 millions


<font size="4">Business Question: <b>Most Common</b> Purchase <b>Size</b></font>

In [14]:
cursor.execute('''
SELECT
    Quantity,
    COUNT(*) AS frequency
FROM Sales
GROUP BY Quantity
ORDER BY frequency DESC
LIMIT 1;
''')
query_results(cursor)

Unnamed: 0,Quantity,frequency
0,73,2083


<font size="4">Business Question: Sales <b>Distribution</b> by <b>Month</b></font>

In [15]:
cursor.execute('''
SELECT
    strftime('%Y-%m', Date) AS year_month,
    SUM(UnitPrice * Quantity) AS monthly_revenue
FROM Sales
GROUP BY year_month
ORDER BY year_month;
''')
df_results = query_results(cursor)
df_results.head()

Unnamed: 0,year_month,monthly_revenue
0,2017-01,701304.66
1,2017-02,634254.53
2,2017-03,721855.89
3,2017-04,700876.2425
4,2017-05,712926.76


<font size="5"><b>#006 Dataset : Movie Dataset</b></font>

In [16]:
df_movie = pd.read_csv('Movie-Data.csv')
conn = s3.connect('Movie_Data.sqlite')
cursor = conn.cursor()

# Salve os dados do Pandas no banco de dados SQLite3
df_movie.to_sql('Movie_Data', conn, if_exists='replace', index=False)

print(df_movie.info())
df_movie.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 508 entries, 0 to 507
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Movie Title    508 non-null    object
 1   Release Date   508 non-null    object
 2   Wikipedia URL  508 non-null    object
 3   Genre          508 non-null    object
 4   Director (1)   508 non-null    object
 5   Director (2)   41 non-null     object
 6   Cast (1)       508 non-null    object
 7   Cast (2)       503 non-null    object
 8   Cast (3)       485 non-null    object
 9   Cast (4)       452 non-null    object
 10  Cast (5)       389 non-null    object
 11  Budget         508 non-null    object
 12  Revenue        508 non-null    object
dtypes: object(13)
memory usage: 51.7+ KB
None


Unnamed: 0,Movie Title,Release Date,Wikipedia URL,Genre,Director (1),Director (2),Cast (1),Cast (2),Cast (3),Cast (4),Cast (5),Budget,Revenue
0,10 Cloverfield Lane,2016-03-08,https://en.wikipedia.org/wiki/10_Cloverfield_Lane,Thriller,Dan Trachtenberg,,Mary Elizabeth Winstead,John Goodman,John Gallagher,,,"$15,000,000.00","$108,300,000.00"
1,13 Hours: The Secret Soldiers of Benghazi,2016-01-15,https://en.wikipedia.org/wiki/13_Hours:_The_Se...,Action,Michael Bay,,James Badge Dale,John Krasinski,Toby Stephens,Pablo Schreiber,Max Martini,"$45,000,000.00","$69,400,000.00"
2,2 Guns,2013-08-02,https://en.wikipedia.org/wiki/2_Guns,Action,Baltasar Kormákur,,Mark Wahlberg,Denzel Washington,Paula Patton,Bill Paxton,Edward James Olmos,"$61,000,000.00","$131,900,000.00"
3,21 Jump Street,2012-03-16,https://en.wikipedia.org/wiki/21_Jump_Street_(...,Comedy,Phil Lord,Chris Miller,Jonah Hill,Channing Tatum,Ice Cube,Brie Larson,Rob Riggle,"$55,000,000.00","$201,500,000.00"
4,22 Jump Street,2014-06-04,https://en.wikipedia.org/wiki/22_Jump_Street,Action,Phil Lord,Chris Miller,Channing Tatum,Jonah Hill,Ice Cube,,,"$84,500,000.00","$331,300,000.00"


In [17]:
# Removing extra spaces on Budget column
cursor.execute('''
UPDATE Movie_Data
SET "Budget " = TRIM("Budget ")
''')
conn.commit()

In [18]:
# Renaming column
cursor.execute('''
ALTER TABLE Movie_Data RENAME COLUMN "Budget " TO "Budget"
''')
conn.commit()

<font size="4">Business Question: What is the <b>average budget and revenue </b> for movies in each genre?</b></font>

In [19]:
cursor.execute('''
SELECT
    Genre,
    AVG(CAST(REPLACE(Budget, '$', '') AS REAL)) AS average_budget,
    AVG(CAST(REPLACE(Revenue, '$', '') AS REAL)) AS average_revenue
FROM
    Movie_Data
GROUP BY
    Genre;
''')
query_results(cursor)


Unnamed: 0,Genre,average_budget,average_revenue
0,Action,82.783333,233.425
1,Adventure,82.809524,308.238095
2,Animation,89.666667,276.0
3,Biography,28.733333,58.333333
4,Comedy,39.857143,123.061224
5,Crime,30.391304,57.173913
6,Documentary,10.0,68.0
7,Drama,26.674157,80.573034
8,Family,78.416667,270.5
9,Fantasy,91.0,244.3


<font size="4">Business Question: What are the <b>top 5 directors</b> who directed the <b>most</b> movies?</b></font>

In [20]:
cursor.execute('''
SELECT
    "Director (1)" AS director_name,
    COUNT("Movie Title") AS movie_count
FROM
    Movie_Data
GROUP BY
    director_name
ORDER BY
    movie_count DESC
LIMIT 5;
''')
query_results(cursor)


Unnamed: 0,director_name,movie_count
0,David Ayer,4
1,Woody Allen,3
2,Tim Story,3
3,Steven Spielberg,3
4,Shawn Levy,3


<font size="4">Business Question: How <b>many movies</b> fall into each budget range?</b></font>

In [21]:
cursor.execute('''
SELECT
    CASE
        WHEN CAST(REPLACE(Budget, '$', '') AS REAL) < 50000000 THEN 'Low Budget'
        WHEN CAST(REPLACE(Budget, '$', '') AS REAL) >= 50000000 AND CAST(REPLACE(Budget, '$', '') AS REAL) < 100000000 THEN 'Medium Budget'
        ELSE 'High Budget'
    END AS budget_range,
    COUNT(*) AS movie_count
FROM
    Movie_Data
GROUP BY
    budget_range;
''')
query_results(cursor)


Unnamed: 0,budget_range,movie_count
0,Low Budget,508


<font size="4">Business Question: What is the <b>average</b> revenue for movies directed by each director?</b></font>

In [22]:
cursor.execute('''
SELECT
    "Director (1)" AS director_name,
    AVG(CAST(REPLACE(Revenue, '$', '') AS REAL)) AS average_revenue
FROM
    Movie_Data
GROUP BY
    director_name
ORDER BY average_revenue DESC;
''')
query_results(cursor)

Unnamed: 0,director_name,average_revenue
0,Peter Jackson,956.0
1,Bill Condon,829.0
2,Tim Miller,782.0
3,Zack Snyder,770.0
4,Marc Webb,757.0
5,Eric Darnell,746.0
6,Anthony Russo,714.0
7,Matt Reeves,710.0
8,Francis Lawrence,704.0
9,Chris Renaud,680.666667


<font size="4">Business Question: How <b>many movies</b> released each year?</b></font>

In [23]:
cursor.execute('''
SELECT
    STRFTIME('%Y', "Release Date") AS release_year,
    COUNT(*) AS movie_count
FROM
    Movie_Data
GROUP BY
    release_year
ORDER BY
    release_year;
''')
query_results(cursor)

Unnamed: 0,release_year,movie_count
0,2012,106
1,2013,85
2,2014,119
3,2015,124
4,2016,74
