<div style="text-align:center;font-size:22pt; font-weight:bold;color:white;border:solid black 1.5pt;background-color:#1e7263;">
    Basic Data Structures with Polars  <br> 
    DataFrames
</div>

In [2]:
# ============================================================
#                                                            =
#             Title: Basic Data Structures with Polars       =
#                             Data Frames                    =
#             ---------------------------------              =
#                                                            =
#             Author: Dr. Saad Laouadi                       =
#                                                            =
#             Copyright: Dr. Saad Laouadi                    =
# ============================================================
#                                                            =
#                       LICENSE                              =
#             ----------------------                         =
#                                                            =
#             This material is intended for educational      =
#             purposes only and may not be used directly in  =
#             courses, video recordings, or similar          =
#             without prior consent from the author.         =
#             When using or referencing this material,       =
#             proper credit must be attributed to the        =
#             author.                                        =
# ============================================================

In [4]:
# Environment Setup
import sys
sys.path.append('../../scripts/')  

# import the working libraries
from importlibs import *
from utils import install_faker, update_pip

In [5]:
# Environment check using the project's helper functions imported from
# `utils` in the setup cell. Judging by the recorded cell output, each call
# reports whether pip / Faker is already up to date or installed.
# NOTE(review): the helper names suggest they may upgrade or install packages
# as a side effect — confirm in utils before running in a shared environment.
update_pip()
install_faker()

pip is already up-to-date.
Faker is already installed. Version: 22.0.0


## Creating DataFrames with the `DataFrame` Constructor

In [10]:
# Column labels (variables): Var_1 ... Var_4
cols = [f"Var_{num}" for num in range(1, 5)]

# Seeded draw of 60 normal values, arranged as rows of 4 (15 rows in total)
np.random.seed(22)
my_data = np.random.normal(size=60).reshape(-1, 4)

# Pair every column label with its target dtype, alternating 32/64-bit floats
dtypes = [pl.Float32, pl.Float64, pl.Float32, pl.Float64]
schema = dict(zip(cols, dtypes))

# Build the frame from the array and preview the first rows
df = pl.DataFrame(data=my_data, schema=schema)
df.head()

Var_1,Var_2,Var_3,Var_4
f32,f64,f32,f64
-0.09195,-1.463351,1.081792,-0.239325
-0.491129,-1.002272,0.918822,-1.103632
0.626493,-0.561514,0.028855,-0.230767
0.587752,0.752318,-1.058503,1.055972
0.74775,1.064677,1.52013,-1.488603


In [11]:
# Examine the type of the df object: confirms that the constructor
# returned a Polars DataFrame
print(type(df))

<class 'polars.dataframe.frame.DataFrame'>


In [15]:
# Inspect the column-name -> dtype mapping of `df`; it should mirror the
# `schema` dict that was passed to the constructor
df.schema

OrderedDict([('Var_1', Float32),
             ('Var_2', Float64),
             ('Var_3', Float32),
             ('Var_4', Float64)])

In [12]:
# Build a dictionary of three random columns, six values each.
# The seed is fixed so the draws are repeatable; the three draws happen in
# the order normal -> poisson -> binomial.
np.random.seed(2)
data = dict(
    Var1=list(np.round(np.random.normal(loc=14, scale=3.7, size=6), 2)),
    Var2=list(np.random.poisson(lam=3, size=6) * 10),
    Var3=list(np.random.binomial(n=5, p=0.5, size=6)),
)

# Show the resulting dict
data

{'Var1': [12.46, 13.79, 6.1, 20.07, 7.36, 10.89],
 'Var2': [20, 30, 40, 20, 10, 10],
 'Var3': [3, 2, 1, 2, 2, 2]}

In [13]:
# Construct a dataframe object from the dict object:
# each key becomes a column name and each list becomes that column's values
df_from_dict = pl.DataFrame(data)
df_from_dict

Var1,Var2,Var3
f64,i64,i64
12.46,20,3
13.79,30,2
6.1,40,1
20.07,20,2
7.36,10,2
10.89,10,2


In [14]:
# No schema was passed when building `df_from_dict`, so the dtypes shown
# here are the ones Polars inferred from the values
df_from_dict.schema

OrderedDict([('Var1', Float64), ('Var2', Int64), ('Var3', Int64)])

In [7]:
# Build a small frame of fake geographic strings with Faker
from faker import Faker

fake = Faker()

# Seed this Faker instance so the generated values are repeatable
fake.seed_instance(11)

# Each column comes from calling one Faker provider six times.
# Columns are generated one after another (all cities, then all countries,
# then all country codes), which keeps the seeded sequence unchanged.
data = dict(
    Cities=[fake.city() for _ in range(6)],
    Countries=[fake.country() for _ in range(6)],
    Country_codes=[fake.country_code() for _ in range(6)],
)

# Wrap the dict in a Polars DataFrame
data_df = pl.DataFrame(data)

# print() renders the plain-text table
print(data_df)

shape: (6, 3)
┌─────────────────┬──────────────────────────────────┬───────────────┐
│ Cities          ┆ Countries                        ┆ Country_codes │
│ ---             ┆ ---                              ┆ ---           │
│ str             ┆ str                              ┆ str           │
╞═════════════════╪══════════════════════════════════╪═══════════════╡
│ Glennmouth      ┆ Palau                            ┆ AL            │
│ Romeroshire     ┆ Saint Vincent and the Grenadines ┆ SM            │
│ East Linda      ┆ Nicaragua                        ┆ BI            │
│ Sotoburgh       ┆ Pakistan                         ┆ BH            │
│ South Kevinland ┆ Cayman Islands                   ┆ AZ            │
│ New Tinamouth   ┆ Nigeria                          ┆ ET            │
└─────────────────┴──────────────────────────────────┴───────────────┘


In [9]:
# Print the schema of the Faker-generated frame; in the recorded output all
# three columns are string columns (shown as Utf8)
print(data_df.schema)

OrderedDict({'Cities': Utf8, 'Countries': Utf8, 'Country_codes': Utf8})


In [12]:
# Build a dict of five integer lists keyed by random title-cased words
import random

# Re-seed the Faker instance so the lists and keys below are repeatable
fake.seed_instance(11)

# Five lists, each holding exactly ten random integers
lst_of_lsts = [
    fake.pylist(nb_elements=10, variable_nb_elements=False, value_types='int')
    for _ in range(5)
]

# NOTE(review): this seeds Python's global `random` module; whether it has any
# effect on `fake.words` (as opposed to the Faker seed set above) is not
# visible from this cell — confirm before relying on it.
random.seed(11)

# Five random words, title-cased, to serve as the keys
keys = [word.title() for word in fake.words(nb=5)]

# Pair each key with one of the integer lists
d_lst = dict(zip(keys, lst_of_lsts))

In [13]:
# Display the dict: five title-cased keys, each mapped to a list of ten integers
d_lst

{'Politics': [9171, 7402, 3025, 3050, 7316, 2323, 8825, 9755, 7421, 245],
 'Beyond': [975, 3116, 9824, 7601, 7217, 8505, 4819, 75, 7492, 6664],
 'Writer': [4161, 3762, 487, 9226, 6560, 4766, 1094, 8, 3436, 7700],
 'Degree': [6511, 1196, 4420, 1427, 5449, 6718, 2205, 1655, 981, 7976],
 'Election': [9163, 7330, 2145, 6287, 6469, 3487, 4420, 321, 3068, 9863]}

In [14]:
# Build (and display) a frame from the dict of lists: each key becomes a
# column and each ten-element list becomes that column's values.
# The result is not assigned to a name.
pl.DataFrame(d_lst)

Politics,Beyond,Writer,Degree,Election
i64,i64,i64,i64,i64
9171,975,4161,6511,9163
7402,3116,3762,1196,7330
3025,9824,487,4420,2145
3050,7601,9226,1427,6287
7316,7217,6560,5449,6469
2323,8505,4766,6718,3487
8825,4819,1094,2205,4420
9755,75,8,1655,321
7421,7492,3436,981,3068
245,6664,7700,7976,9863


In [18]:
# Build a one-row frame from a Faker-generated dict of scalar values
fake.seed_instance(21)  # fixed seed so keys and values are repeatable

# Five entries; with value_types='int' every value is a single integer
rnd_dict = fake.pydict(nb_elements=5, variable_nb_elements=False, value_types='int')

# Scalar values give a frame with one row and one column per key
rnd_df = pl.DataFrame(rnd_dict)

# Plain-text rendering of the frame
print(rnd_df)

shape: (1, 5)
┌───────────┬──────┬──────┬───────┬────────┐
│ community ┆ kid  ┆ road ┆ prove ┆ father │
│ ---       ┆ ---  ┆ ---  ┆ ---   ┆ ---    │
│ i64       ┆ i64  ┆ i64  ┆ i64   ┆ i64    │
╞═══════════╪══════╪══════╪═══════╪════════╡
│ 3539      ┆ 8396 ┆ 8275 ┆ 52    ┆ 6079   │
└───────────┴──────┴──────┴───────┴────────┘


In [16]:
# Generate random data: six people, each with a list of ten random integers
fake.seed_instance(21)

# The six names are generated first, then exactly one list per name.
# Building the lists per name keeps the number of value lists equal to the
# number of columns, so no surplus lists are generated and then silently
# dropped by zip().
rnd_dict = {
    person: fake.pylist(nb_elements=10, variable_nb_elements=False, value_types='int')
    for person in [fake.name() for _ in range(6)]
}

# One column per person, ten rows
rnd_df = pl.DataFrame(rnd_dict)
rnd_df

Michael Reid,Daniel Sparks,Kayla Wagner,Aaron Watson,Andrea Leon,David Williams
i64,i64,i64,i64,i64,i64
7239,3848,6333,4026,2659,809
5389,3233,439,2763,7196,2287
1897,9515,2604,2851,7916,463
380,2006,46,6141,4718,5992
1449,9157,1148,601,4226,6041
320,3294,9363,6668,569,7937
2639,2394,9117,9675,4072,5045
6575,4783,2960,5678,6463,8754
2475,7394,5705,8292,5539,2282
608,5209,5678,4050,2477,6264


In [20]:
# Country-level panel: 3 rows for England, 2 for Japan, 5 for Germany.
# NOTE(review): the keys "ocuntry" and "GPD" look like typos for "country"
# and "GDP". They are kept unchanged because they become the column names
# of `df` — confirm nothing depends on them before renaming.
data = {
    "ocuntry": [
        *np.repeat("England", 3),
        *np.repeat("Japan", 2),
        *np.repeat("Germany", 5),
    ],
    # Consecutive years per country, each series starting at 2018
    "Year": [*range(2018, 2021), *range(2018, 2020), *range(2018, 2023)],
    # Evenly spaced values per country
    "GPD": [
        *np.linspace(2000, 3000, num=3),
        *np.linspace(5000, 6000, num=2),
        *np.linspace(4000, 6000, num=5),
    ],
}

# Create a data frame
df = pl.DataFrame(data)

# Print the data frame
print(df)

shape: (10, 3)
┌─────────┬──────┬────────┐
│ ocuntry ┆ Year ┆ GPD    │
│ ---     ┆ ---  ┆ ---    │
│ str     ┆ i64  ┆ f64    │
╞═════════╪══════╪════════╡
│ England ┆ 2018 ┆ 2000.0 │
│ England ┆ 2019 ┆ 2500.0 │
│ England ┆ 2020 ┆ 3000.0 │
│ Japan   ┆ 2018 ┆ 5000.0 │
│ …       ┆ …    ┆ …      │
│ Germany ┆ 2019 ┆ 4500.0 │
│ Germany ┆ 2020 ┆ 5000.0 │
│ Germany ┆ 2021 ┆ 5500.0 │
│ Germany ┆ 2022 ┆ 6000.0 │
└─────────┴──────┴────────┘


## Creating a Polars DataFrame from a Pandas DataFrame

In [29]:
# Seed Faker, then collect ten fake profiles into a pandas DataFrame
fake.seed_instance(2)
data = pd.DataFrame([fake.profile() for _ in range(10)])

# Hand the pandas frame to the Polars constructor and display the result
pl.DataFrame(data)

job,company,ssn,residence,current_location,blood_group,website,username,name,sex,address,mail,birthdate
str,str,str,str,list[f64],str,list[str],str,str,str,str,str,date
"""Banker""","""Miller and Son…","""754-86-5049""","""092 Little Uni…","[-80.356638, -165.265656]","""AB-""","[""https://snow-fox.net/"", ""http://www.johnson.com/"", … ""https://pollard-hayden.com/""]","""framirez""","""Joel Moreno""","""M""","""775 Tucker For…","""ncollins@yahoo…",1967-05-24
"""Youth worker""","""Mueller LLC""","""744-02-3136""","""Unit 1090 Box …","[-18.638796, -48.556833]","""B-""","[""https://www.cox.com/""]","""watsondavid""","""Tiffany Johnso…","""F""","""USNS Mills FPO…","""bauerjohn@gmai…",1908-07-01
"""Warden/ranger""","""Newton, Wilson…","""032-40-7351""","""6927 Clarence …","[49.126367, 134.045698]","""O+""","[""https://www.green-obrien.biz/"", ""http://www.brown.com/"", … ""http://king.com/""]","""hernandezpaul""","""Jay Fox""","""M""","""11023 Jeff Pik…","""gailhodges@gma…",1992-07-27
"""Politician's a…","""Hunt-Perkins""","""053-54-8603""","""750 Cheryl Hig…","[-7.879592, -73.489453]","""A+""","[""http://tucker.com/"", ""http://watson.com/"", … ""https://www.foster-martinez.com/""]","""heatherjohnson…","""Sarah Davidson…","""F""","""745 Thomas Lan…","""rfrederick@yah…",1953-10-17
"""Designer, grap…","""Carlson-Dunn""","""713-87-1382""","""52826 Lauren S…","[-1.573427, 55.603566]","""B+""","[""http://www.hatfield.com/"", ""http://jacobson.com/"", ""https://daniel.org/""]","""qhill""","""Christopher Di…","""M""","""758 Reeves Sho…","""edwardsalexand…",1983-06-04
"""Chief Marketin…","""Powell and Son…","""747-73-9936""","""68349 Tracy Ri…","[-21.900311, 95.651652]","""B+""","[""http://www.smith-washington.com/"", ""https://adams-shaw.com/"", … ""http://www.porter.com/""]","""austin69""","""Kristin Rasmus…","""F""","""2784 Jonathan …","""laurenfuller@y…",1912-10-06
"""Programmer, sy…","""Hall, Ramirez …","""115-87-6214""","""5157 Jackson M…","[28.647527, 163.393493]","""B-""","[""https://www.barron.com/"", ""https://www.taylor-taylor.com/"", ""https://peterson.net/""]","""amanda15""","""Paul Booth""","""M""","""45771 Leslie L…","""usmith@gmail.c…",2003-09-21
"""Therapist, mus…","""Weaver-Harriso…","""155-47-5184""","""23542 Ward Cor…","[-42.359901, -146.144115]","""O+""","[""https://www.mills-bruce.com/"", ""https://www.long.info/"", … ""https://www.cruz-chavez.com/""]","""longstephen""","""Mr. Timothy Fr…","""M""","""53202 Santiago…","""heatherweaver@…",1931-03-03
"""Accounting tec…","""Morgan, Rice a…","""381-63-0025""","""17309 Connie J…","[-46.245894, 94.325648]","""A-""","[""http://www.harris-peterson.com/"", ""https://www.edwards-evans.com/""]","""jermaine28""","""Diane Cannon""","""F""","""833 Peter Cany…","""vfernandez@gma…",2004-12-05
"""Therapist, spo…","""Johnson-Gonzal…","""111-18-8918""","""PSC 1743, Box …","[20.779751, 43.046518]","""A+""","[""https://www.park.com/"", ""https://medina-long.net/"", … ""https://salazar-gilbert.org/""]","""kyle25""","""Pedro Horn""","""M""","""99243 Garcia T…","""barnesmartha@g…",1940-10-18
