# Transaction Analysis

In [1]:
# Third-party imports, bound to their conventional aliases (pd, np, plt).
# Note: plotting uses the matplotlib.pyplot submodule, not the top-level package.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Input file to be processed

In [2]:
# Load the transaction records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='transaction_data.csv')

In [3]:
# Review the data: display the DataFrame (pandas elides the middle rows of a long frame).
df

Unnamed: 0,TransactionID,CustID,Date,Employee,ProductID,ProductName,VendorID,ProductVendor,UnitPrice,Quantity,Subtotal,Tax(8%),OrderTotal
0,10258,1,7/17/2023,Nancy Davolio,32,Mascarpone Fabioli,14,Formaggi Fortini s.r.l.,25.6,6,153.6,12.29,165.89
1,10275,2,8/7/2023,Nancy Davolio,24,Guaran Fantstica,10,Refrescos Americanas LTDA,3.6,12,43.2,3.46,46.66
2,10275,2,8/7/2023,Nancy Davolio,59,Raclette Courdavault,28,Gai pturage,44.0,6,264.0,21.12,285.12
3,10292,3,8/28/2023,Nancy Davolio,20,Sir Rodney's Marmalade,8,"Specialty Biscuits, Ltd.",64.8,20,1296.0,103.68,1399.68
4,10293,4,8/29/2023,Nancy Davolio,18,Carnarvon Tigers,7,"Pavlova, Ltd.",50.0,12,600.0,48.00,648.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1348,11016,20,4/10/2025,Anne Dodsworth,31,Gorgonzola Telino,14,Formaggi Fortini s.r.l.,12.5,15,187.5,15.00,202.50
1349,11016,20,4/10/2025,Anne Dodsworth,36,Inlagd Sill,17,Svensk Sjfda AB,19.0,16,304.0,24.32,328.32
1350,11058,82,4/29/2025,Anne Dodsworth,21,Sir Rodney's Scones,8,"Specialty Biscuits, Ltd.",10.0,3,30.0,2.40,32.40
1351,11058,82,4/29/2025,Anne Dodsworth,60,Camembert Pierrot,28,Gai pturage,34.0,21,714.0,57.12,771.12


In [4]:
# Print a structural summary of the DataFrame: number of entries, column names,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1353 entries, 0 to 1352
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TransactionID  1353 non-null   int64  
 1   CustID         1353 non-null   int64  
 2   Date           1353 non-null   object 
 3   Employee       1353 non-null   object 
 4   ProductID      1353 non-null   int64  
 5   ProductName    1353 non-null   object 
 6   VendorID       1353 non-null   int64  
 7   ProductVendor  1353 non-null   object 
 8   UnitPrice      1353 non-null   float64
 9   Quantity       1353 non-null   int64  
 10  Subtotal       1353 non-null   float64
 11  Tax(8%)        1353 non-null   float64
 12  OrderTotal     1353 non-null   float64
dtypes: float64(4), int64(5), object(4)
memory usage: 137.5+ KB


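After reviewing the data, I determined that some columns are not necessary for the analysis, and that the data must be broken up into multiple tables: a single transaction spans several rows (one per product), so analyzing the flat table directly could give misleading results based on those duplicates.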

### Dataset normalization

In [5]:
# Remove the 'Tax(8%)' column, which is not needed for this analysis.
# errors='ignore' keeps this cell re-runnable: the result is assigned back to
# `df`, so without it a second run would raise KeyError because the column
# has already been removed.
df = df.drop(columns=['Tax(8%)'], errors='ignore')
df

Unnamed: 0,TransactionID,CustID,Date,Employee,ProductID,ProductName,VendorID,ProductVendor,UnitPrice,Quantity,Subtotal,OrderTotal
0,10258,1,7/17/2023,Nancy Davolio,32,Mascarpone Fabioli,14,Formaggi Fortini s.r.l.,25.6,6,153.6,165.89
1,10275,2,8/7/2023,Nancy Davolio,24,Guaran Fantstica,10,Refrescos Americanas LTDA,3.6,12,43.2,46.66
2,10275,2,8/7/2023,Nancy Davolio,59,Raclette Courdavault,28,Gai pturage,44.0,6,264.0,285.12
3,10292,3,8/28/2023,Nancy Davolio,20,Sir Rodney's Marmalade,8,"Specialty Biscuits, Ltd.",64.8,20,1296.0,1399.68
4,10293,4,8/29/2023,Nancy Davolio,18,Carnarvon Tigers,7,"Pavlova, Ltd.",50.0,12,600.0,648.00
...,...,...,...,...,...,...,...,...,...,...,...,...
1348,11016,20,4/10/2025,Anne Dodsworth,31,Gorgonzola Telino,14,Formaggi Fortini s.r.l.,12.5,15,187.5,202.50
1349,11016,20,4/10/2025,Anne Dodsworth,36,Inlagd Sill,17,Svensk Sjfda AB,19.0,16,304.0,328.32
1350,11058,82,4/29/2025,Anne Dodsworth,21,Sir Rodney's Scones,8,"Specialty Biscuits, Ltd.",10.0,3,30.0,32.40
1351,11058,82,4/29/2025,Anne Dodsworth,60,Camembert Pierrot,28,Gai pturage,34.0,21,714.0,771.12


For my analysis I did not see a use for the 'Tax(8%)' column.

Next, I will use the `.groupby()` method to aggregate the data into new DataFrames, and then pull insights from each of them.

In [13]:
# Group by Employee and total every numeric column for each employee.
# GroupBy.sum() does not take a list of columns: its first positional
# parameter is `numeric_only`, so the column list that used to be passed here
# only acted as a truthy flag. State that flag explicitly; the resulting
# columns are unchanged (non-numeric columns are excluded from the sum).
dfE = df.groupby(['Employee']).sum(numeric_only=True)
dfE

Unnamed: 0_level_0,TransactionID,CustID,ProductID,VendorID,UnitPrice,Quantity,Subtotal,OrderTotal
Employee,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Andrew Fuller,1554078,6516,5673,2079,4042.43,1829,49583.21,53549.87
Anne Dodsworth,665659,2646,2334,823,2038.56,822,25215.25,27232.47
Janet Leverling,2086955,8576,7923,2807,4833.81,2552,57347.97,61935.82
Laura Callahan,1828669,6503,7084,2439,4291.63,2250,57014.73,61575.87
Margaret Peacock,2815855,10395,10476,3848,7411.25,3308,94531.5,102093.97
Michael Suyama,1257471,5074,4833,1711,2737.12,1532,34899.51,37691.47
Nancy Davolio,2364681,5253,9287,3261,5765.38,2495,69532.81,75095.44
Robert King,1075497,4398,3977,1441,2374.41,1299,29596.6,31964.33
Steven Buchanan,775711,3328,3060,964,1560.15,1004,21377.75,23087.99


For this table I want specific insights into the quantity sold and the revenue generated by each employee, so I will keep only two columns here: `Quantity` and `OrderTotal`.

In [14]:
# Keep only the per-employee quantity sold and order total.
# Selecting the wanted columns (instead of dropping the unwanted ones by name)
# lets this cell be re-run without a KeyError for columns that are already gone.
dfE = dfE[['Quantity', 'OrderTotal']]
dfE

Unnamed: 0_level_0,Quantity,OrderTotal
Employee,Unnamed: 1_level_1,Unnamed: 2_level_1
Andrew Fuller,1829,53549.87
Anne Dodsworth,822,27232.47
Janet Leverling,2552,61935.82
Laura Callahan,2250,61575.87
Margaret Peacock,3308,102093.97
Michael Suyama,1532,37691.47
Nancy Davolio,2495,75095.44
Robert King,1299,31964.33
Steven Buchanan,1004,23087.99


Next, I want to separate the sales data in my main dataset by vendor and product, again using the `.groupby()` method.

In [10]:
# Group by VendorID and ProductID and total every numeric column per pair.
# GroupBy.sum() does not take a list of columns: its first positional
# parameter is `numeric_only`, so the column list that used to be passed here
# only acted as a truthy flag. State that flag explicitly; the resulting
# columns are unchanged (non-numeric columns are excluded from the sum).
dfV = df.groupby(['VendorID', 'ProductID']).sum(numeric_only=True)
dfV

Unnamed: 0_level_0,Unnamed: 1_level_0,TransactionID,CustID,UnitPrice,Quantity,Subtotal,OrderTotal
VendorID,ProductID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,277515,1164,439.20,313,5169.60,5583.17
1,2,312283,965,532.00,407,7425.20,8019.21
1,3,64361,201,58.00,84,800.00,864.00
2,4,127246,450,242.00,163,3198.80,3454.70
2,5,52853,207,98.05,71,1376.65,1486.78
...,...,...,...,...,...,...,...
27,58,74941,246,90.10,95,1219.00,1316.52
28,59,286295,1119,1375.00,301,15257.00,16477.56
28,60,288805,1070,870.40,373,11934.00,12888.73
29,61,172597,739,450.30,187,5238.30,5657.36
