# ARP - 500e Data Merging

In [1]:
import pandas as pd
import numpy as np
import re

## Merge Forum Datasets

In [2]:
# Load the cleaned SpeakEV forum posts and preview the first rows
speakev_csv = '500e - SpeakEV_Clean.csv'
df_speakev_clean = pd.read_csv(speakev_csv)
df_speakev_clean.head()

Unnamed: 0,Date,URL,Text
0,2023-07-21,https://www.speakev.com/threads/fiat-500e-icon...,hi there we will receive our new fiat 500e ico...
1,2023-08-17,https://www.speakev.com/threads/what-charging-...,hi there i have ordered a fiat 500e icon on le...
2,2022-10-22,https://www.speakev.com/threads/fiat-500e-serv...,i had my 2015 fiat 500e shipped from ca a few ...
3,2023-09-12,https://www.speakev.com/threads/thoughts-on-th...,the latest fiat 500e la prima designio by kahn...
4,2023-09-26,https://www.speakev.com/threads/fiat-500e-char...,is there a way to delete not just unselect a c...


In [3]:
# Load the cleaned PistonHeads forum posts and preview the first rows
pistonheads_csv = '500e - PistonHeads_Clean.csv'
df_ph_clean = pd.read_csv(pistonheads_csv)
df_ph_clean.head()

Unnamed: 0,Date,URL,Text
0,2023-04-21,https://www.pistonheads.com/gassing/topic.asp?...,i think full ev is the only and last chance fo...
1,2023-04-21,https://www.pistonheads.com/gassing/topic.asp?...,range rovers of various sizes really are the d...
2,2023-04-21,https://www.pistonheads.com/gassing/topic.asp?...,thats it in a nutshellfor as long as anyone ca...
3,2023-04-21,https://www.pistonheads.com/gassing/topic.asp?...,great news just make it beautiful and you will...
4,2023-04-21,https://www.pistonheads.com/gassing/topic.asp?...,hear hear im sad to see the xj go wonder what ...


In [4]:
# Stack the SpeakEV and PistonHeads posts row-wise into one frame;
# ignore_index=True renumbers the rows 0..n-1 instead of keeping each file's index
forum_frames = [df_speakev_clean, df_ph_clean]
df_combined = pd.concat(forum_frames, ignore_index=True)

# 500e - Potential Customers (Merge Forums with YouTube Comments)

In [5]:
# Work on a copy of the combined forum posts so df_combined stays untouched
df_forums = df_combined.copy()

# Load the cleaned YouTube comments
youtube_csv = '500e - YouTube Comments_Clean.csv'
df_comments = pd.read_csv(youtube_csv)

# Preview both inputs (shown as a tuple of two frames)
(df_forums.head(), df_comments.head())

(         Date                                                URL  \
 0  2023-07-21  https://www.speakev.com/threads/fiat-500e-icon...   
 1  2023-08-17  https://www.speakev.com/threads/what-charging-...   
 2  2022-10-22  https://www.speakev.com/threads/fiat-500e-serv...   
 3  2023-09-12  https://www.speakev.com/threads/thoughts-on-th...   
 4  2023-09-26  https://www.speakev.com/threads/fiat-500e-char...   
 
                                                 Text  
 0  hi there we will receive our new fiat 500e ico...  
 1  hi there i have ordered a fiat 500e icon on le...  
 2  i had my 2015 fiat 500e shipped from ca a few ...  
 3  the latest fiat 500e la prima designio by kahn...  
 4  is there a way to delete not just unselect a c...  ,
          Date           ID                                               Text
 0  2024-05-04  0kDbvxpjLZs  9 seconds for an electric car thats especially...
 1  2024-02-16  0kDbvxpjLZs  good honest review thanks like the car im in a...
 2  2024-0

Merge the two dataframes:

In [6]:
# Define the source based on URL for forums
def get_forum_source(url):
    """Return the name of the forum a thread URL belongs to.

    Parameters
    ----------
    url : str or any
        Thread URL. Non-string values (for example the float NaN that pandas
        uses for a missing URL, or None) are accepted and classified as
        'Unknown' instead of raising.

    Returns
    -------
    str
        'SpeakEV' if the URL contains 'speakev.com', 'PistonHeads' if it
        contains 'pistonheads.com', otherwise 'Unknown'.
    """
    # `'speakev.com' in float('nan')` raises TypeError, so a single missing
    # URL would abort the whole .apply(); reject non-strings up front.
    if not isinstance(url, str):
        return 'Unknown'

    # Host names are case-insensitive, so compare in lower case.
    url_lower = url.lower()
    known_domains = (
        ('speakev.com', 'SpeakEV'),
        ('pistonheads.com', 'PistonHeads'),
    )
    for domain, source in known_domains:
        if domain in url_lower:
            return source
    return 'Unknown'

# Columns shared by both sources in the merged dataset
final_columns = ['Date', 'Source', 'Text']

# Label each forum post with its site of origin, derived from the thread URL.
# The frame is built from df_combined (which df_forums was copied from) rather
# than from df_forums itself so this cell can be re-run: df_forums no longer
# has a 'URL' column after this statement, and reading df_forums['URL'] on a
# second run would raise KeyError.
df_forums = df_combined.assign(
    Source=df_combined['URL'].apply(get_forum_source)
)[final_columns]

# For YouTube comments, the source is always 'YouTube Comment'.
# The comments file already names its columns 'Date' and 'Text', so no
# renaming is needed before selecting the final columns.
df_comments = df_comments.assign(Source='YouTube Comment')[final_columns]

# Merge the two datasets
df_merge = pd.concat([df_forums, df_comments], ignore_index=True)

# Display the first few rows of the final dataframe
print(df_merge.head())


         Date   Source                                               Text
0  2023-07-21  SpeakEV  hi there we will receive our new fiat 500e ico...
1  2023-08-17  SpeakEV  hi there i have ordered a fiat 500e icon on le...
2  2022-10-22  SpeakEV  i had my 2015 fiat 500e shipped from ca a few ...
3  2023-09-12  SpeakEV  the latest fiat 500e la prima designio by kahn...
4  2023-09-26  SpeakEV  is there a way to delete not just unselect a c...


## Filter to Posts From March 2020 Onwards

In [7]:
# Take an independent (deep) copy so later changes to df_500 leave df_merge intact
df_500 = df_merge.copy(deep=True)
df_500.head()

Unnamed: 0,Date,Source,Text
0,2023-07-21,SpeakEV,hi there we will receive our new fiat 500e ico...
1,2023-08-17,SpeakEV,hi there i have ordered a fiat 500e icon on le...
2,2022-10-22,SpeakEV,i had my 2015 fiat 500e shipped from ca a few ...
3,2023-09-12,SpeakEV,the latest fiat 500e la prima designio by kahn...
4,2023-09-26,SpeakEV,is there a way to delete not just unselect a c...


In [8]:
# Parse the 'Date' column into datetimes (stored back on df_500), then print
# the earliest and latest dates in the dataset, one per line
df_500['Date'] = pd.to_datetime(df_500['Date'])
all_dates = df_500['Date']
print(all_dates.min(), all_dates.max(), sep='\n')

2011-12-07 00:00:00
2024-07-13 00:00:00


In [9]:
# Keep only rows dated on or after 1 March 2020, then print the earliest and
# latest remaining dates to confirm the new range
cutoff = '2020-03-01'
on_or_after_cutoff = df_500['Date'] >= cutoff
df_500_filtered = df_500[on_or_after_cutoff]
filtered_dates = df_500_filtered['Date']
print(filtered_dates.min(), filtered_dates.max(), sep='\n')

2020-03-14 00:00:00
2024-07-13 00:00:00


In [11]:
# Write the filtered dataset to a new CSV file, leaving out the row index
output_csv = '500e - Potential Customers.csv'
df_500_filtered.to_csv(output_csv, index=False)