## Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import calendar
import calendar
from datetime import datetime

In [2]:
from sklearn import linear_model
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor

In [3]:
from pylab import rcParams
# Default every matplotlib figure to 12 x 8 inches.
rcParams['figure.figsize'] = 12, 8
# Render figures inline in the notebook output.
%matplotlib inline

In [4]:
# Print NumPy floating-point values with two digits of precision.
np.set_printoptions(precision=2)

In [5]:
# Google Colab only: mount Google Drive at /content/drive so the CSV files
# referenced below can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load data

In [6]:
# Daily sales table (salesdaily.csv), read from the mounted Drive folder.
df_sales_daily = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Projects/Pharma Sales/salesdaily.csv')

In [16]:
# Monthly sales table (salesmonthly.csv), read from the mounted Drive folder.
df_sales_monthly = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Projects/Pharma Sales/salesmonthly.csv')

In [30]:
# Hourly sales table (saleshourly.csv), read from the mounted Drive folder.
df_sales_hourly = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Projects/Pharma Sales/saleshourly.csv')

In [7]:
# Preview the first rows of the daily table to check its column layout.
df_sales_daily.head()

Unnamed: 0,datum,M01AB,M01AE,N02BA,N02BE,N05B,N05C,R03,R06,Year,Month,Hour,Weekday Name
0,1/2/2014,0.0,3.67,3.4,32.4,7.0,0.0,0.0,2.0,2014,1,248,Thursday
1,1/3/2014,8.0,4.0,4.4,50.6,16.0,0.0,20.0,4.0,2014,1,276,Friday
2,1/4/2014,2.0,1.0,6.5,61.85,10.0,0.0,9.0,1.0,2014,1,276,Saturday
3,1/5/2014,4.0,3.0,7.0,41.1,8.0,0.0,3.0,0.0,2014,1,276,Sunday
4,1/6/2014,5.0,1.0,4.5,21.7,16.0,2.0,6.0,2.0,2014,1,276,Monday


## Group the 2nd drug (M01AE) by Weekday

In [11]:
# Total M01AE volume per weekday, largest total first.
# The two-column subset is taken inline rather than assigned back, so
# df_sales_daily keeps every column it was loaded with.
result = (
    df_sales_daily[['M01AE', 'Weekday Name']]
    .groupby(['Weekday Name'], as_index=False)
    .sum()
    .sort_values('M01AE', ascending=False)
)

In [13]:
# `result` is sorted by M01AE descending, so its first row is the top weekday.
result_day = result['Weekday Name'].iloc[0]

In [14]:
# M01AE total for that top weekday, rounded to two decimals for display.
top_weekday_volume = result['M01AE'].iloc[0]
result_value = round(top_weekday_volume, 2)

In [15]:
# Report the weekday with the largest M01AE total and that total.
print('The second drug, M01AE, was most frequently sold on ', result_day,
      ' with the volume of ', result_value, sep='')

The second drug, M01AE, was most frequently sold on Sunday with the volume of 1384.94


## Top 3 drugs having the largest sales in January 2015, July 2016, September 2017

In [17]:
# Preview the first rows of the monthly table to check its column layout.
df_sales_monthly.head()

Unnamed: 0,datum,M01AB,M01AE,N02BA,N02BE,N05B,N05C,R03,R06
0,2014-01-31,127.69,99.09,152.1,878.03,354.0,50.0,112.0,48.2
1,2014-02-28,133.32,126.05,177.0,1001.9,347.0,31.0,122.0,36.2
2,2014-03-31,137.44,92.95,147.655,779.275,232.0,20.0,112.0,85.4
3,2014-04-30,113.1,89.475,130.9,698.5,209.0,18.0,97.0,73.7
4,2014-05-31,101.79,119.933,132.1,628.78,270.0,23.0,107.0,123.7


In [28]:
def top_3_drugs_by_month(month, year, sales_df=None):
    """Print the three drugs with the largest sales volume in one month.

    Selects the rows of the monthly sales table whose ``datum`` starts with
    ``YYYY-MM``, orders the drug columns by the first matching row's volume
    (largest first) and prints the top three, followed by a blank separator.

    Parameters
    ----------
    month : int or str
        Calendar month number (1-12).
    year : int or str
        Four-digit year, e.g. ``2015``.
    sales_df : pandas.DataFrame, optional
        Table with a string ``datum`` column formatted ``YYYY-MM-DD`` and one
        numeric column per drug code. Defaults to the notebook-level
        ``df_sales_monthly``.

    Returns
    -------
    None
        The result is printed, not returned.

    Raises
    ------
    KeyError
        If the table contains no row for the requested month.
    """
    drug_columns = ['M01AB', 'M01AE', 'N02BA', 'N02BE', 'N05B', 'N05C', 'R03', 'R06']

    if sales_df is None:
        sales_df = df_sales_monthly

    month = int(month)
    year = str(year)

    # Zero-padded "YYYY-MM" key; a plain prefix match is enough, so no regex
    # (and no escaping of '-') is involved.
    month_key = '{}-{:02d}'.format(year, month)
    sales = sales_df.loc[sales_df['datum'].str.startswith(month_key, na=False)]
    if sales.empty:
        # Fail with the requested month in the message instead of the bare
        # "KeyError: 0" that sorting an empty frame would produce.
        raise KeyError('no sales data for ' + month_key)

    # Renumber rows so the matched month is at label 0, which the sort uses.
    sales = sales.reset_index(drop=True)

    # Order the drug columns by that row's volume, largest first.
    top_Sales_by_Product = sales[drug_columns].sort_values(by=0, ascending=False, axis=1)

    # Print results
    print('Top 3 Drugs by Sale in ' + calendar.month_name[month] + ' ' + year)
    for drug in top_Sales_by_Product.columns.values[0:3]:
        volume = round(top_Sales_by_Product[drug].iloc[0], 2)
        print(' - Product: ' + str(drug) + ', Volume sold: ' + str(volume))
    print("\n")

In [29]:
# Top 3 drugs by sale for January 2015, July 2016 and September 2017,
# printed in that order.
for report_month, report_year in [(1, 2015), (7, 2016), (9, 2017)]:
    top_3_drugs_by_month(report_month, report_year)

Top 3 Drugs by Sale in January 2015
 - Product: N02BE, Volume sold: 1044.24
 - Product: N05B, Volume sold: 463.0
 - Product: R03, Volume sold: 177.25


Top 3 Drugs by Sale in July 2016
 - Product: N02BE, Volume sold: 652.36
 - Product: N05B, Volume sold: 240.0
 - Product: M01AB, Volume sold: 203.97


Top 3 Drugs by Sale in September 2017
 - Product: N02BE, Volume sold: 863.75
 - Product: N05B, Volume sold: 223.0
 - Product: R03, Volume sold: 139.0




## Top 1 Drug sold on Mondays in 2017

In [33]:
# Start this section from a fresh, unfiltered copy of the daily sales table.
df_sales_daily = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Projects/Pharma Sales/salesdaily.csv')

In [34]:
# Preview the first rows of the reloaded daily table.
df_sales_daily.head()

Unnamed: 0,datum,M01AB,M01AE,N02BA,N02BE,N05B,N05C,R03,R06,Year,Month,Hour,Weekday Name
0,1/2/2014,0.0,3.67,3.4,32.4,7.0,0.0,0.0,2.0,2014,1,248,Thursday
1,1/3/2014,8.0,4.0,4.4,50.6,16.0,0.0,20.0,4.0,2014,1,276,Friday
2,1/4/2014,2.0,1.0,6.5,61.85,10.0,0.0,9.0,1.0,2014,1,276,Saturday
3,1/5/2014,4.0,3.0,7.0,41.1,8.0,0.0,3.0,0.0,2014,1,276,Sunday
4,1/6/2014,5.0,1.0,4.5,21.7,16.0,2.0,6.0,2.0,2014,1,276,Monday


### Filter out all except Mondays in 2017

In [35]:
# Keep only rows whose date text contains "2017" and whose weekday is Monday.
in_2017 = df_sales_daily['datum'].str.contains('2017', flags=re.I, regex=True)
on_monday = df_sales_daily['Weekday Name'] == 'Monday'
df_sales_daily = df_sales_daily.loc[in_2017 & on_monday]

### Group by Weekday and Sum up

In [36]:
# Collapse the rows into one total per weekday. numeric_only=True keeps the
# text `datum` column out of the sum, so only numeric columns are aggregated
# regardless of how the installed pandas treats non-numeric columns.
df_sales_daily = df_sales_daily.groupby(['Weekday Name'], as_index=False).sum(numeric_only=True)

### Filter by the chosen Drugs and Sort in descending order

In [37]:
# Keep only the eight drug columns, then reorder them left to right by the
# value in row 0 so the largest total comes first.
drug_columns = ['M01AB', 'M01AE', 'N02BA', 'N02BE', 'N05B', 'N05C', 'R03', 'R06']
df_sales_daily = df_sales_daily[drug_columns]
result = df_sales_daily.sort_values(by=0, axis=1, ascending=False)

In [46]:
# Columns are already ordered by volume, so only the leading one is reported.
for field in result.columns[:1]:
    monday_volume = round(result[field].iloc[0], 2)
    print('The most frequently sold Drug on Mondays in 2017 is ', field,
          ' with the Volume of ', monday_volume, sep='')

The most frequently sold Drug on Mondays in 2017 is N02BE with the Volume of 1160.56
