## Setup

Importing relevant modules, connecting to a database and creating a cursor

In [1]:
import sqlite3
import pandas as pd
import json

In [2]:
con = sqlite3.connect("dev/cademycode.db")
cur = con.cursor()

## Inspect and Clean the data

Import the tables in cademycode.db as dataframes. Inspect the tables for missing or invalid data and perform any data cleaning operations you think are necessary.

In [3]:
schema_query = cur.execute("""SELECT name, sql FROM sqlite_master""")

# type text,
# name text,
# tbl_name text,
# rootpage integer,
# sql text

schema_query.fetchall()

[('cademycode_students',
  'CREATE TABLE cademycode_students (\n\tuuid INTEGER, \n\tname VARCHAR, \n\tdob VARCHAR, \n\tsex TEXT, \n\tcontact_info JSON, \n\tjob_id VARCHAR, \n\tnum_course_taken VARCHAR, \n\tcurrent_career_path_id VARCHAR, \n\ttime_spent_hrs VARCHAR\n)'),
 ('cademycode_courses',
  'CREATE TABLE cademycode_courses (\n\tcareer_path_id BIGINT, \n\tcareer_path_name TEXT, \n\thours_to_complete BIGINT\n)'),
 ('cademycode_student_jobs',
  'CREATE TABLE cademycode_student_jobs (\n\tjob_id BIGINT, \n\tjob_category TEXT, \n\tavg_salary BIGINT\n)')]

###  Import and explore 'cademycode_students' table

In [4]:
students_query = """SELECT * FROM cademycode_students"""
df_students = pd.read_sql_query(students_query, con)

In [5]:
print("\nStudents DF Shape")
df_students.shape


Students DF Shape


(5000, 9)

In [6]:
print("Students DF info")
df_students.info()

print("\nStudents DF first 10 rows")
df_students.head(10)

Students DF info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   uuid                    5000 non-null   int64 
 1   name                    5000 non-null   object
 2   dob                     5000 non-null   object
 3   sex                     5000 non-null   object
 4   contact_info            5000 non-null   object
 5   job_id                  4995 non-null   object
 6   num_course_taken        4749 non-null   object
 7   current_career_path_id  4529 non-null   object
 8   time_spent_hrs          4529 non-null   object
dtypes: int64(1), object(8)
memory usage: 351.7+ KB

Students DF first 10 rows


Unnamed: 0,uuid,name,dob,sex,contact_info,job_id,num_course_taken,current_career_path_id,time_spent_hrs
0,1,Annabelle Avery,1943-07-03,F,"{""mailing_address"": ""303 N Timber Key, Irondal...",7.0,6.0,1.0,4.99
1,2,Micah Rubio,1991-02-07,M,"{""mailing_address"": ""767 Crescent Fair, Shoals...",7.0,5.0,8.0,4.4
2,3,Hosea Dale,1989-12-07,M,"{""mailing_address"": ""P.O. Box 41269, St. Bonav...",7.0,8.0,8.0,6.74
3,4,Mariann Kirk,1988-07-31,F,"{""mailing_address"": ""517 SE Wintergreen Isle, ...",6.0,7.0,9.0,12.31
4,5,Lucio Alexander,1963-08-31,M,"{""mailing_address"": ""18 Cinder Cliff, Doyles b...",7.0,14.0,3.0,5.64
5,6,Shavonda Mcmahon,1989-10-15,F,"{""mailing_address"": ""P.O. Box 81591, Tarpon Sp...",6.0,10.0,3.0,10.12
6,7,Terrell Bleijenberg,1959-05-05,M,"{""mailing_address"": ""P.O. Box 53471, Oskaloosa...",2.0,9.0,8.0,24.17
7,8,Stanford Allan,1997-11-22,M,"{""mailing_address"": ""255 Spring Avenue, Point ...",3.0,3.0,1.0,19.54
8,9,Tricia Delacruz,1961-10-20,F,"{""mailing_address"": ""997 Dewy Apple, Lake Lind...",1.0,6.0,9.0,1.75
9,10,Regenia van der Helm,1999-02-23,N,"{""mailing_address"": ""220 Middle Ridge, Falcon ...",5.0,7.0,6.0,13.55


In [7]:
df_students['mailing_address'] = df_students['contact_info'].apply(lambda x: json.loads(x).get('mailing_address', None))

In [8]:
df_students['email'] = df_students['contact_info'].apply(lambda x: json.loads(x).get('email', None))

In [9]:
df_students = df_students.drop(columns= {'contact_info'})

In [10]:
# reorder colums logically
df_students = df_students[["uuid", "name", "dob", "sex", "mailing_address", "email", "job_id", "num_course_taken", "current_career_path_id", "time_spent_hrs"]]

In [11]:
df_students = df_students.rename(columns={"current_career_path_id":"career_path_id"})

In [140]:
df_students['job_id'] = df_students['job_id'].fillna(0)
df_students['num_course_taken'] = df_students['num_course_taken'].fillna(0)
df_students['career_path_id'] = df_students['career_path_id'].fillna(0)

In [142]:
df_students_person_info = df_students['name'] + " " + df_students['dob'] + " " + df_students['email']
df_students_person_info.nunique()

5000

In [143]:
#change data types in the dataframe

df_students = df_students.astype(
    {'job_id': 'float64', 'num_course_taken': 'float64', 'career_path_id': 'float64', 'time_spent_hrs': 'float64'}
)

# df_students = df_students.fillna({
#     'job_id': None,
#     'num_course_taken': None,
#     'current_career_path_id': None
# })

df_students = df_students.astype(
    {'job_id': 'Int64', 'num_course_taken': 'Int64', 'career_path_id': 'Int64'}
)
df_students.dtypes

uuid                  int64
name                 object
dob                  object
sex                  object
mailing_address      object
email                object
job_id                Int64
num_course_taken      Int64
career_path_id        Int64
time_spent_hrs      float64
dtype: object

In [97]:
df_students.describe()

Unnamed: 0,uuid,job_id,num_course_taken,career_path_id,time_spent_hrs
count,5000.0,4995.0,4749.0,4529.0,4529.0
mean,2500.5,4.172172,7.533586,5.478914,11.520625
std,1443.520003,2.148098,4.609884,2.874135,7.564228
min,1.0,1.0,0.0,1.0,0.0
25%,1250.75,2.0,4.0,3.0,5.38
50%,2500.5,4.0,7.0,5.0,10.67
75%,3750.25,6.0,12.0,8.0,16.75
max,5000.0,8.0,15.0,10.0,35.98


In [98]:
## percentage of null values in each column
df_students.isna().sum() / len(df_students.index) * 100

uuid                0.00
name                0.00
dob                 0.00
sex                 0.00
mailing_address     0.00
email               0.00
job_id              0.10
num_course_taken    5.02
career_path_id      9.42
time_spent_hrs      9.42
dtype: float64

###  Import and explore 'cademycode_courses' table

In [99]:
courses_query = """SELECT * FROM cademycode_courses"""
df_courses = pd.read_sql_query(courses_query, con)

In [100]:
print('Courses DF shape')
df_courses.shape

Courses DF shape


(10, 3)

In [101]:
df_courses = df_courses.astype(
    {'career_path_id': 'Int64', 'hours_to_complete': 'Int64'}
)

In [145]:
print("Courses DF info")
df_courses.info()

print("\nCourses DF first 10 rows")
df_courses.head(10)

Courses DF info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   career_path_id     10 non-null     Int64 
 1   career_path_name   10 non-null     object
 2   hours_to_complete  10 non-null     Int64 
dtypes: Int64(2), object(1)
memory usage: 388.0+ bytes

Courses DF first 10 rows


Unnamed: 0,career_path_id,career_path_name,hours_to_complete
0,1,data scientist,20
1,2,data engineer,20
2,3,data analyst,12
3,4,software engineering,25
4,5,backend engineer,18
5,6,frontend engineer,20
6,7,iOS developer,27
7,8,android developer,27
8,9,machine learning engineer,35
9,10,ux/ui designer,15


In [103]:
print(df_courses.to_string())

   career_path_id           career_path_name  hours_to_complete
0               1             data scientist                 20
1               2              data engineer                 20
2               3               data analyst                 12
3               4       software engineering                 25
4               5           backend engineer                 18
5               6          frontend engineer                 20
6               7              iOS developer                 27
7               8          android developer                 27
8               9  machine learning engineer                 35
9              10             ux/ui designer                 15


###  Import and explore 'cademycode_courses' table

In [104]:
jobs_query = """SELECT * FROM cademycode_student_jobs"""
df_jobs = pd.read_sql_query(jobs_query, con)

In [105]:
print('Jobs DF shape')
df_jobs.shape

Jobs DF shape


(13, 3)

In [128]:
print("Jobs DF info")
df_jobs.info()

print("\nJobs DF first 10 rows")
df_jobs.head(20)

Jobs DF info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   job_id        13 non-null     Int64 
 1   job_category  13 non-null     object
 2   avg_salary    13 non-null     Int64 
dtypes: Int64(2), object(1)
memory usage: 466.0+ bytes

Jobs DF first 10 rows


Unnamed: 0,job_id,job_category,avg_salary
0,1,analytics,86000
1,2,engineer,101000
2,3,software developer,110000
3,4,creative,66000
4,5,financial services,135000
5,6,education,61000
6,7,HR,80000
7,8,student,10000
8,9,healthcare,120000
9,0,other,80000


In [107]:
df_jobs = df_jobs.astype(
    {'job_id': 'Int64', 'avg_salary': 'Int64'}
)

In [108]:
df_jobs.head()

Unnamed: 0,job_id,job_category,avg_salary
0,1,analytics,86000
1,2,engineer,101000
2,3,software developer,110000
3,4,creative,66000
4,5,financial services,135000


## Create the Output CSV ##

Use the cleaned tables to produce an analytics-ready SQLite database and flat CSV file. The final CSV should contain all the data the analysts might need in a single table.

###  Transforming the data

In [148]:
df_merged = pd.merge(df_students, df_courses,  how='left', left_on="career_path_id", right_on="career_path_id")

In [149]:
df_final = pd.merge(df_merged, df_jobs, how="left", left_on="job_id", right_on="job_id")

In [150]:
df_final.shape[0]

7006

In [146]:
df_students.shape[0]

5000

In [113]:
df_students_data['_completed_path'] = df_students_data.apply(
    lambda row: False if pd.isna(row['time_spent_hrs']) or pd.isna(row['hours_to_complete']) else row['time_spent_hrs'] > row['hours_to_complete'], 
    axis=1)

In [114]:
df_students_data.head()

Unnamed: 0,uuid,name,dob,sex,mailing_address,email,job_id,num_course_taken,career_path_id,time_spent_hrs,career_path_name,hours_to_complete,job_category,avg_salary,_completed_path
0,1,Annabelle Avery,1943-07-03,F,"303 N Timber Key, Irondale, Wisconsin, 84736",annabelle_avery9376@woohoo.com,7,6,1,4.99,data scientist,20,HR,80000,False
1,2,Micah Rubio,1991-02-07,M,"767 Crescent Fair, Shoals, Indiana, 37439",rubio6772@hmail.com,7,5,8,4.4,android developer,27,HR,80000,False
2,3,Hosea Dale,1989-12-07,M,"P.O. Box 41269, St. Bonaventure, Virginia, 83637",hosea_dale8084@coldmail.com,7,8,8,6.74,android developer,27,HR,80000,False
3,4,Mariann Kirk,1988-07-31,F,"517 SE Wintergreen Isle, Lane, Arkansas, 82242",kirk4005@hmail.com,6,7,9,12.31,machine learning engineer,35,education,61000,False
4,5,Lucio Alexander,1963-08-31,M,"18 Cinder Cliff, Doyles borough, Rhode Island,...",alexander9810@hmail.com,7,14,3,5.64,data analyst,12,HR,80000,False


In [115]:
print(df_students_data[df_students_data["_completed_path"] == True])

      uuid                   name         dob sex  \
38      39           Miki Darvill  1996-03-15   F   
45      46         Janett Fleming  1973-10-25   F   
78      79           Kizzy Mendez  1952-09-26   F   
87      88  Danilo van Zijderveld  1981-07-11   M   
111    112         Brandie Montes  1975-03-29   N   
...    ...                    ...         ...  ..   
4961  4962          Eliseo Morris  1962-02-02   M   
4969  4970          Rochell Morin  1958-05-10   F   
4978  4979          Cole Mitchell  1958-11-05   M   
4993  4994      Katherina Ruloffs  1979-06-13   F   
4997  4998         Brock Mckenzie  2004-11-25   M   

                                        mailing_address  \
38         481 Silent Harbor, Topsfield, Montana, 60773   
45        91 Velvet Road, South Venice, Virginia, 40051   
78    576 NE Fourth Terrace, Clawson, New Hampshire,...   
87    111 Easy Embers Plain, East Brewton, Arkansas,...   
111               P.O. Box 35869, Ottumwa, Maine, 20129   
...      

In [139]:
df_students_data.isnull().values.any()

True

In [117]:
df_students_data_person_info = df_students_data['name'] + " " + df_students_data['dob'] + " " + df_students_data['email']
df_students_data_person_info.nunique()

5000

In [138]:
df_students_data.isna().sum()

uuid                   0
name                   0
dob                    0
sex                    0
mailing_address        0
email                  0
job_id                 5
num_course_taken     251
career_path_id       471
time_spent_hrs       471
career_path_name     471
hours_to_complete    471
job_category           5
avg_salary             5
_completed_path        0
dtype: int64

### Loading data into a new table in sqlite3 and exporting as CSV ###

In [119]:
df_students_data.to_sql(name='students_data', con=con, if_exists="replace")

In [123]:
students_data_query = """SELECT * FROM students_data LIMIT 10"""

In [125]:
print(cur.execute(students_data_query).fetchall())

[(0, 1, 'Annabelle Avery', '1943-07-03', 'F', '303 N Timber Key, Irondale, Wisconsin, 84736', 'annabelle_avery9376@woohoo.com', 7, 6, 1, 4.99, 'data scientist', 20, 'HR', 80000, 0), (1, 2, 'Micah Rubio', '1991-02-07', 'M', '767 Crescent Fair, Shoals, Indiana, 37439', 'rubio6772@hmail.com', 7, 5, 8, 4.4, 'android developer', 27, 'HR', 80000, 0), (2, 3, 'Hosea Dale', '1989-12-07', 'M', 'P.O. Box 41269, St. Bonaventure, Virginia, 83637', 'hosea_dale8084@coldmail.com', 7, 8, 8, 6.74, 'android developer', 27, 'HR', 80000, 0), (3, 4, 'Mariann Kirk', '1988-07-31', 'F', '517 SE Wintergreen Isle, Lane, Arkansas, 82242', 'kirk4005@hmail.com', 6, 7, 9, 12.31, 'machine learning engineer', 35, 'education', 61000, 0), (4, 5, 'Lucio Alexander', '1963-08-31', 'M', '18 Cinder Cliff, Doyles borough, Rhode Island, 73737', 'alexander9810@hmail.com', 7, 14, 3, 5.64, 'data analyst', 12, 'HR', 80000, 0), (5, 6, 'Shavonda Mcmahon', '1989-10-15', 'F', 'P.O. Box 81591, Tarpon Springs, Montana, 37057', 'shavonda

In [127]:
df_students_data.to_csv('students_data.csv', index=False)