# Digital Marketing Strategy

Team 
* Paul Antony | 500938419 
* Carl Hümbs | 500927977 
* Matthijs Snijders | 500780453 
* Otto Tagapere | 500930931 

In [14]:
#Importing all necessary libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Data preprocessing and cleaning

In [15]:
# Load the four semicolon-separated theme-park exports from the GitHub repository.
# NOTE(review): every URL carries a `token=` query parameter. If these tokens are
# short-lived the downloads will stop working on a fresh run, and access tokens should
# not be stored in the notebook - consider reading the files from a local checkout.
csv_urls = [
    'https://raw.githubusercontent.com/crlhbs/Group1_MarketingStrategy/main/original%20theme%20park%20datasets/Themeparcs%20I.csv?token=GHSAT0AAAAAACPJTRO6YHOXTFAG56P57LV6ZPQHJFQ',
    'https://raw.githubusercontent.com/crlhbs/Group1_MarketingStrategy/main/original%20theme%20park%20datasets/Themeparcs%20II.csv?token=GHSAT0AAAAAACPJTRO7BYOUWE2CLI6AFLXMZPQHNDQ',
    'https://raw.githubusercontent.com/crlhbs/Group1_MarketingStrategy/main/original%20theme%20park%20datasets/Themeparcs%20III.csv?token=GHSAT0AAAAAACPJTRO6XX66HWSEJ6XWSRBMZPQHN3A',
    'https://raw.githubusercontent.com/crlhbs/Group1_MarketingStrategy/main/original%20theme%20park%20datasets/Themeparcs%20IV.csv?token=GHSAT0AAAAAACPJTRO7CPU4UIDQ5MBDQZIUZPQHK7A',
]

# Read the files in order; each one becomes its own DataFrame
df_1, df_2, df_3, df_4 = [pd.read_csv(url, delimiter=';') for url in csv_urls]

In [16]:
# Stack the four partial tables row-wise into one DataFrame named `df`
# (ignore_index=True gives the result a fresh 0..n-1 index)
partial_frames = [df_1, df_2, df_3, df_4]
df = pd.concat(partial_frames, ignore_index=True)

# Quick check: print the first rows of the combined DataFrame
print(df.head())

               Date                     Profile  \
0  12/31/23 9:40 PM  Wild Adventures Theme Park   
1  12/31/23 8:01 PM            Fun Spot America   
2  12/31/23 7:30 PM            Fun Spot America   
3  12/31/23 7:00 PM            Disneyland Paris   
4  12/31/23 7:00 PM       Silverwood Theme Park   

                                             Message  Number of Reactions  \
0  🎟️🚨 Today's the FINAL day to use your 2023 Sea...                 16.0   
1  See you next year! 😉                Don’t miss...                 38.0   
2  Don't miss our HUGE firework display ! 🥳\n\n#f...                 23.0   
3     Fijne jaarwisseling! 🥂🎉\nEn tot volgend jaar 😉                646.0   
4  It's the last day of our New Year's Sale! Save...                 17.0   

   Number of Shares  Number of Comments (primary)  Number of Shares.1  \
0               NaN                           2.0                 2.0   
1               NaN                           1.0                 2.0   
2          

### 1.1 Cleaning the dataset

In [17]:
# Number of null entries per column of the combined DataFrame
missing_values = df.isna().sum()

# Full overview: one count per column
print("Missing values in each column:")
print(missing_values)

# Shortened overview: only the columns that actually have gaps
print("\nColumns with missing values:")
print(missing_values.loc[missing_values.gt(0)])

Missing values in each column:
Date                                0
Profile                             0
Message                             0
Number of Reactions                 0
Number of Shares                15443
Number of Comments (primary)        0
Number of Shares.1                  0
Comments per post                   0
Comments on posts by fans           0
Post interaction rate               0
Engagement                          0
Number of Likes                     0
Link-posts                      14888
Picture Posts                    3881
Video-posts                     12964
Profile-ID                          0
Post-ID                             0
Link                                0
dtype: int64

Columns with missing values:
Number of Shares    15443
Link-posts          14888
Picture Posts        3881
Video-posts         12964
dtype: int64


In [18]:
# Remove two columns from the combined table:
#   - "Number of Shares": described as empty in every row (15443 missing values in the
#     check above); "Number of Shares.1" is kept.
#   - "Comments on posts by fans": has no missing values but is removed as well.
#     NOTE(review): the reason for dropping it is not documented - confirm it is intentional.
# Reassigning `df` (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError for the removed columns.
columns_to_drop = ['Number of Shares', 'Comments on posts by fans']
df = df.drop(columns=columns_to_drop, errors='ignore')

In [19]:
# Count columns that currently hold float values but should be whole numbers
columns_to_convert = [
    'Number of Reactions',
    'Number of Comments (primary)',
    'Number of Shares.1',
    'Comments per post',
    'Number of Likes',
]

# Cast every listed column to int in a single astype call
df = df.astype({column: int for column in columns_to_convert})

# Print the dtypes to confirm the cast
print(df.dtypes)

Date                             object
Profile                          object
Message                          object
Number of Reactions               int32
Number of Comments (primary)      int32
Number of Shares.1                int32
Comments per post                 int32
Post interaction rate            object
Engagement                       object
Number of Likes                   int32
Link-posts                      float64
Picture Posts                   float64
Video-posts                     float64
Profile-ID                       object
Post-ID                          object
Link                             object
dtype: object


In [20]:
# 'Engagement' is stored as text; replace the comma with a dot (treating it as the
# decimal separator) and then parse the column as float.
# regex=False: the comma is matched as a literal character.
df['Engagement'] = (
    df['Engagement']
    .str.replace(',', '.', regex=False)
    .astype(float)
)

In [21]:
# Shorter, lower-case column names for a better overview
column_names = {
    'Date': 'date',
    'Profile': 'profile',
    'Message': 'message',
    'Number of Reactions': 'nr_reactions',
    'Number of Comments (primary)': 'nr_comments',
    'Number of Shares.1': 'nr_shares',
    'Comments per post': 'comments/post',
    'Post interaction rate': 'post_interaction_rate',
    'Engagement': 'engagement',
    'Number of Likes': 'nr_likes',
    'Link-posts': 'link_posts',
    'Picture Posts': 'picture_posts',
    'Video-posts': 'video_posts',
    'Profile-ID': 'profile_id',
    'Post-ID': 'post_id',
    'Link': 'link',
}
df = df.rename(columns=column_names)

# Store the profile names in lower case
df['profile'] = df['profile'].str.lower()

In [22]:
# Show the first row of the renamed DataFrame
df.head(n=1)

Unnamed: 0,date,profile,message,nr_reactions,nr_comments,nr_shares,comments/post,post_interaction_rate,engagement,nr_likes,link_posts,picture_posts,video_posts,profile_id,post_id,link
0,12/31/23 9:40 PM,wild adventures theme park,🎟️🚨 Today's the FINAL day to use your 2023 Sea...,16,2,2,2,"1,09E+12",1090000000000.0,13,,1.0,,84679583132,84679583132_770120045161180,https://www.facebook.com/617429960430190/posts...


### **1.3 Adding the Level 2 variable**

In [23]:
# Load the follower table (the level-2 variable) from the Excel file in the working directory
followers_file = 'Themepark_Followers.xlsx'
level2_df = pd.read_excel(followers_file)

# Print the first rows as a quick check of what was loaded
print(level2_df.head())


                      Profile  Followers
0  Wild Adventures Theme Park         66
1            Fun Spot America     100000
2       Silverwood Theme Park     381000
3                Belmont Park      51000
4            SeaWorld Orlando     267000


In [24]:
# Lower-case column names for the follower table
level2_df = level2_df.rename(columns={'Profile': 'profile', 'Followers': 'followers'})

# Store the profile names in lower case
level2_df['profile'] = level2_df['profile'].str.lower()

# Show the first row of the adjusted table
level2_df.head(n=1)

Unnamed: 0,profile,followers
0,wild adventures theme park,66


In [25]:
# Attach follower counts to the posts: left-join `df` with `level2_df` on the 'profile'
# column, keeping every row of `df`.
# validate='many_to_one' makes pandas raise a MergeError if a profile occurs more than
# once in `level2_df`, which would otherwise silently duplicate rows of `df`.
combined_df = df.merge(level2_df, on='profile', how='left', validate='many_to_one')

# A left join leaves 'followers' as NaN for profiles that have no match in `level2_df`.
# List those profiles so the gap is visible instead of being exported unnoticed.
profiles_without_followers = combined_df.loc[combined_df['followers'].isna(), 'profile'].unique()
if len(profiles_without_followers) > 0:
    print('Profiles without a follower count:', list(profiles_without_followers))

# Show the first rows of the merged table
combined_df.head()


Unnamed: 0,date,profile,message,nr_reactions,nr_comments,nr_shares,comments/post,post_interaction_rate,engagement,nr_likes,link_posts,picture_posts,video_posts,profile_id,post_id,link,followers
0,12/31/23 9:40 PM,wild adventures theme park,🎟️🚨 Today's the FINAL day to use your 2023 Sea...,16,2,2,2,"1,09E+12",1090000000000.0,13,,1.0,,84679583132,84679583132_770120045161180,https://www.facebook.com/617429960430190/posts...,66.0
1,12/31/23 8:01 PM,fun spot america,See you next year! 😉 Don’t miss...,38,1,2,1,"4,17E+11",417000000000.0,33,,,,"1,8402E+11",184020466977_767286052104076,https://www.facebook.com/653224623510220/posts...,100000.0
2,12/31/23 7:30 PM,fun spot america,Don't miss our HUGE firework display ! 🥳\n\n#f...,23,2,3,2,"2,85E+11",285000000000.0,21,,,1.0,"1,8402E+11",184020466977_767267948772553,https://www.facebook.com/653224623510220/posts...,100000.0
3,12/31/23 7:00 PM,disneyland paris,Fijne jaarwisseling! 🥂🎉\nEn tot volgend jaar 😉,646,52,12,56,"1,36E+11",136000000000.0,478,,1.0,,"6,52389E+14",652389081615295_775974521237956,https://www.facebook.com/724434559725286/posts...,
4,12/31/23 7:00 PM,silverwood theme park,It's the last day of our New Year's Sale! Save...,17,2,0,2,"4,96E+11",496000000000.0,16,,,1.0,"7,62702E+14",762701550509617_678920137770716,https://www.facebook.com/562559642740100/posts...,381000.0


### **1.4 Exporting the combined and cleaned dataset** for further analytics in separate files

In [26]:
# Write the merged table to a CSV file in the working directory, without the row index.
# The file is meant to be pushed to the team's GitHub repository afterwards.
output_file = 'combined_themepark_dataset.csv'
combined_df.to_csv(output_file, index=False)