In [2]:
import pandas as pd
import numpy as np
import random

## Create DataFrames

pd.read_csv(filename) | From a CSV file

pd.read_table(filename) | From a delimited text file (like TSV)

pd.read_excel(filename) | From an Excel file

pd.read_sql(query, connection_object) | Read from a SQL table/database

pd.read_json(json_string) | Read from a JSON formatted string, URL or file.

pd.read_html(url) | Parses an html URL, string or file and extracts tables to a list of 
dataframes

pd.read_clipboard() | Takes the contents of your clipboard and passes it to read_table()

pd.DataFrame(dict) | From a dict, keys for columns names, values for data as lists

In [5]:
# Loading a data frame from a CSV
salesDataDf = pd.read_csv("SampleData/SalesData.csv",low_memory=False)
print(salesDataDf)

    Region  Country Product  WK_1  WK_2  WK_3  Wk_4
0  America      USA  Laptop  1241  1160  1929  1174
1  America      USA   Phone  1098  1092  1089  1819
2  America   Canada  Laptop  1441  1099  1950  1394
3  America   Canada   Phone  1990  1057  1656  1060
4   Europe  Belgium  Laptop  1084  1116  1002  1566
5   Europe  Belgium   Phone  1574  1958  1793  1213
6   Europe  Finland  Laptop  1325  1374  1300  1579
7   Europe  Finland   Phone  1347  1736  1782  1921


In [4]:
salesdf1 = pd.read_excel("SampleData/SalesData.xlsx")
display(salesdf1)

Unnamed: 0,Region,Country,Product,WK_1,WK_2,WK_3,Wk_4
0,America,USA,Laptop,1241,1160,1929,1174
1,America,USA,Phone,1098,1092,1089,1819
2,America,Canada,Laptop,1441,1099,1950,1394
3,America,Canada,Phone,1990,1057,1656,1060
4,Europe,Belgium,Laptop,1084,1116,1002,1566
5,Europe,Belgium,Phone,1574,1958,1793,1213
6,Europe,Finland,Laptop,1325,1374,1300,1579
7,Europe,Finland,Phone,1347,1736,1782,1921


In [6]:
#Create a data frame from list of dictionaries
regionCountry = {"AMR":["USA","Canada"],"EUROPE":["Austria","Belgium"]}
cars = ["Audi","BMW","Mercedes"]
months = ["Jan","Feb","Mar"]
salesData = []
for region,countries in regionCountry.items():
    for country in countries:
        for car in cars:
            salesRecord = {"region":region,"country":country,"car":car}
            for month in months:
                salesRecord[month] = random.randint(100,2000)
             
            salesData.append(salesRecord)
            
df1 = pd.DataFrame(salesData)
print(df1)

    region  country       car   Jan   Feb   Mar
0      AMR      USA      Audi  1269  1234  1113
1      AMR      USA       BMW  1595   371  1074
2      AMR      USA  Mercedes  1281   722  1600
3      AMR   Canada      Audi   765   960  1491
4      AMR   Canada       BMW   562  1889  1832
5      AMR   Canada  Mercedes   349   884  1799
6   EUROPE  Austria      Audi   191  1600  1817
7   EUROPE  Austria       BMW  1172  1282   461
8   EUROPE  Austria  Mercedes   392  1019  1575
9   EUROPE  Belgium      Audi   655  1278   463
10  EUROPE  Belgium       BMW  1016  1255   502
11  EUROPE  Belgium  Mercedes   332   720   708


In [3]:
# Create DataFrame from a List
seasons = ['Winter','Spring','Summer','Fall']
df2 = pd.DataFrame(seasons)
print(df2)

        0
0  Winter
1  Spring
2  Summer
3    Fall


In [10]:
# Create a DataFrame from an Array of Array
data = [[100,200,300],
        [400,500,600],
        ['John','Jane','Mary','Jin']
       ]
df3 = pd.DataFrame(data,index=[1,2,3],columns=['a','b','c','d'])
print(df3)

      a     b     c     d
1   100   200   300  None
2   400   500   600  None
3  John  Jane  Mary   Jin


In [13]:
# Create a multi index data frame
df = pd.DataFrame(
{
    "a":[100,200,300],
    "b":[200,300,400],
    "c":[400,500,600]
}, index = pd.MultiIndex.from_tuples(
[('d', 1), ('d', 2),
          ('e', 2)], names=['n', 'v']))
print(df)

       a    b    c
n v               
d 1  100  200  400
  2  200  300  500
e 2  300  400  600


In [7]:
# Creating a DataFrame by passing an arrays of values and an array of column header names:
df4 = pd.DataFrame([['Debasis Das',100],['John Doe',98],['Jane Doe',93]], columns = ["name","score"],dtype = float)
print(df4)

          name  score
0  Debasis Das  100.0
1     John Doe   98.0
2     Jane Doe   93.0


In [8]:
# Create a DataFrame from a Dictionary:
dictionary = {"names":["John Doe","Jane Doe","Mary Jane"], "score":[90,91,93]}
df5 = pd.DataFrame(dictionary)
print(df5)


       names  score
0   John Doe     90
1   Jane Doe     91
2  Mary Jane     93


In [31]:
# Create a DataFrame from a List of Dictionaries:

listOfDict = [{"names":"John Doe", "age":30},{"names":"Jane Doe", "age":10, "score":98.0}]
df6 = pd.DataFrame(listOfDict)
display(df6)

Unnamed: 0,names,age,score
0,John Doe,30,
1,Jane Doe,10,98.0


In [9]:
# Add a column to the DataFrame based on a condition:

df7 = pd.DataFrame({'Score':[100,20,30,80,90]})
print(df7)
df7.loc[df7.Score <= 80, "Grade"] = "B"
df7.loc[df7.Score > 80, "Grade"] = "A"
df7.loc[df7.Score < 35, "Grade"] = "F"
print(df7)


   Score
0    100
1     20
2     30
3     80
4     90
   Score Grade
0    100     A
1     20     F
2     30     F
3     80     B
4     90     A


In [11]:
# Creating a DataFrame from a numpy array:

df8 = pd.DataFrame(np.random.randint(low=90, high=100), columns=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'],index=['Temp Morning', 'Temp Afternoon', 'Temp Evening'])
print(df8)

                Monday  Tuesday  Wednesday  Thursday  Friday
Temp Morning        98       98         98        98      98
Temp Afternoon      98       98         98        98      98
Temp Evening        98       98         98        98      98


In [12]:
# Creating a DataFrame with Random Values in few columns

salesData = {"Region":["Americas","Americas","Americas","Americas","Europe","Europe","Europe","Europe"],
 "Country":["USA","USA","Mexico","Mexico","Belgium","Belgium","Finland","Finland"],
 "Product":["Phone","TV","Phone","TV","Phone","TV","Phone","TV"],
 "wk1":np.random.uniform(1000,2000,8),
 "wk2":np.random.uniform(1000,2000,8),
 "wk3":np.random.uniform(1000,2000,8)
 }
salesDF = pd.DataFrame(salesData)
print(salesDF)


     Region  Country Product          wk1          wk2          wk3
0  Americas      USA   Phone  1160.968594  1388.322110  1412.185520
1  Americas      USA      TV  1794.859113  1770.758286  1125.346636
2  Americas   Mexico   Phone  1952.839220  1623.881235  1621.388295
3  Americas   Mexico      TV  1426.614261  1343.331160  1871.506235
4    Europe  Belgium   Phone  1711.601714  1379.586115  1510.875070
5    Europe  Belgium      TV  1088.516800  1232.987273  1705.042338
6    Europe  Finland   Phone  1379.155027  1024.696066  1295.029629
7    Europe  Finland      TV  1219.446740  1963.107658  1030.649401


In [13]:
# Adding New Columns to an Existing DataFrame:

salesDF["Total"] = salesDF['wk1'] + salesDF['wk2'] + salesDF['wk3']
salesDF["Total (K)"] = salesDF["Total"] /1000
print(salesDF)


     Region  Country Product          wk1          wk2          wk3  \
0  Americas      USA   Phone  1160.968594  1388.322110  1412.185520   
1  Americas      USA      TV  1794.859113  1770.758286  1125.346636   
2  Americas   Mexico   Phone  1952.839220  1623.881235  1621.388295   
3  Americas   Mexico      TV  1426.614261  1343.331160  1871.506235   
4    Europe  Belgium   Phone  1711.601714  1379.586115  1510.875070   
5    Europe  Belgium      TV  1088.516800  1232.987273  1705.042338   
6    Europe  Finland   Phone  1379.155027  1024.696066  1295.029629   
7    Europe  Finland      TV  1219.446740  1963.107658  1030.649401   

         Total  Total (K)  
0  3961.476224   3.961476  
1  4690.964035   4.690964  
2  5198.108749   5.198109  
3  4641.451657   4.641452  
4  4602.062898   4.602063  
5  4026.546412   4.026546  
6  3698.880722   3.698881  
7  4213.203799   4.213204  


In [14]:
# Creating a DataFrame using Assign Function
# Assign returns a copy of the data frame as a new object with the new columns added to the original data frame.
salesDF1 = salesDF[["Region","Country","Product","wk1"]]
salesDF2 = salesDF1.assign(week1_k=lambda x: salesDF1['wk1']/1000)
print(salesDF2)


     Region  Country Product          wk1   week1_k
0  Americas      USA   Phone  1160.968594  1.160969
1  Americas      USA      TV  1794.859113  1.794859
2  Americas   Mexico   Phone  1952.839220  1.952839
3  Americas   Mexico      TV  1426.614261  1.426614
4    Europe  Belgium   Phone  1711.601714  1.711602
5    Europe  Belgium      TV  1088.516800  1.088517
6    Europe  Finland   Phone  1379.155027  1.379155
7    Europe  Finland      TV  1219.446740  1.219447


## Create a DataFrame by merging two DataFrames

In [17]:
data1 = {"Country":["USA","Mexico","Brazil"],"Jan_Sales":[1000,2000,3000]}
data2 = {"Country":["Canada","Mexico","Brazil","Belgium"],"Feb_Sales":[4000,5000,6000,7000]}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
#Pandas merge function can automatically detect which columns are common between the dataframes 
#Pandas will use the common columns to merge the two dataframes
print(df1)
print(df2)

  Country  Jan_Sales
0     USA       1000
1  Mexico       2000
2  Brazil       3000
   Country  Feb_Sales
0   Canada       4000
1   Mexico       5000
2   Brazil       6000
3  Belgium       7000


In [18]:
#Inner Merge
df4 = df1.merge(df2,how='inner')
print(df4)


  Country  Jan_Sales  Feb_Sales
0  Mexico       2000       5000
1  Brazil       3000       6000


In [19]:
#Outer Merge
df5 = df1.merge(df2,how='outer')
print(df5)

   Country  Jan_Sales  Feb_Sales
0      USA     1000.0        NaN
1   Mexico     2000.0     5000.0
2   Brazil     3000.0     6000.0
3   Canada        NaN     4000.0
4  Belgium        NaN     7000.0


In [20]:
#Left Merge
df6 = df1.merge(df2,how='left')
print(df6)

  Country  Jan_Sales  Feb_Sales
0     USA       1000        NaN
1  Mexico       2000     5000.0
2  Brazil       3000     6000.0


In [21]:
#Right Merge
df7 = df1.merge(df2,how='right')
print(df7)

   Country  Jan_Sales  Feb_Sales
0   Canada        NaN       4000
1   Mexico     2000.0       5000
2   Brazil     3000.0       6000
3  Belgium        NaN       7000
