# ARP - Data Preprocessing of YouTube Comments

In [1]:
import pandas as pd
import numpy as np
import re

# 600e - Cleaning Comments

In [2]:
# Read csv file
df_600 = pd.read_csv('DataPull-600e.csv')
df_600.head(15)

Unnamed: 0,Timestamp,Username,VideoID,Comment,Date
0,2024-05-01T13:34:19Z,@gm2543,AXGNb76QgH0,Ehy do you miss out the 500X its same and hybr...,2024-05-01T13:34:19Z
1,2024-04-28T10:14:55Z,@mohammaddavoudian7897,AXGNb76QgH0,"Enjoyed watching, thank you.",2024-04-28T10:14:55Z
2,2024-04-06T19:20:29Z,@StrandgaZt,AXGNb76QgH0,Nobody are liking the eye lits so far.,2024-04-06T19:20:29Z
3,2024-03-16T23:54:26Z,@christopher9727,AXGNb76QgH0,Don't follow the worldly trends follow Jesus C...,2024-03-16T23:54:26Z
4,2024-03-08T17:15:08Z,@rm783,AXGNb76QgH0,25000 is a fair price,2024-03-08T17:15:08Z
5,2024-03-04T13:20:15Z,@dusandestanovic5884,AXGNb76QgH0,It looks solid and I know it's a great driving...,2024-03-04T13:20:15Z
6,2024-02-24T20:26:58Z,@DailyDrivenDaily,AXGNb76QgH0,Really love the video and review of new Fiat 6...,2024-02-24T20:26:58Z
7,2024-02-16T14:32:43Z,@User-pu3lc,AXGNb76QgH0,Imagine if Fiat invested in perfecting the 500...,2024-02-16T14:32:57Z
8,2024-02-15T11:52:27Z,@Mctigel,AXGNb76QgH0,OMG i hate you shoes.They look like someting o...,2024-02-15T11:52:27Z
9,2024-02-13T13:47:24Z,@szabolcs8349,AXGNb76QgH0,My wife has a 500 C and all we love it. Nobody...,2024-02-13T13:47:24Z


In [3]:
# Drop column 'Username' and 'Date'
df_600 = df_600.drop(columns=['Username', 'Date'])
df_600.head(15)

Unnamed: 0,Timestamp,VideoID,Comment
0,2024-05-01T13:34:19Z,AXGNb76QgH0,Ehy do you miss out the 500X its same and hybr...
1,2024-04-28T10:14:55Z,AXGNb76QgH0,"Enjoyed watching, thank you."
2,2024-04-06T19:20:29Z,AXGNb76QgH0,Nobody are liking the eye lits so far.
3,2024-03-16T23:54:26Z,AXGNb76QgH0,Don't follow the worldly trends follow Jesus C...
4,2024-03-08T17:15:08Z,AXGNb76QgH0,25000 is a fair price
5,2024-03-04T13:20:15Z,AXGNb76QgH0,It looks solid and I know it's a great driving...
6,2024-02-24T20:26:58Z,AXGNb76QgH0,Really love the video and review of new Fiat 6...
7,2024-02-16T14:32:43Z,AXGNb76QgH0,Imagine if Fiat invested in perfecting the 500...
8,2024-02-15T11:52:27Z,AXGNb76QgH0,OMG i hate you shoes.They look like someting o...
9,2024-02-13T13:47:24Z,AXGNb76QgH0,My wife has a 500 C and all we love it. Nobody...


Adjust to lowercase:

In [4]:
# Adjust 'Comment' column to lowercase
df_600['Comment'] = df_600['Comment'].str.lower()

Remove punctuation only:

In [5]:
# Remove punctuation, but keep emojis
def remove_punctuation(text):
    return re.sub(r'[^\w\s\U0001F600-\U0001F64F]', '', text)

# Apply the function to the 'Comment' column
df_600['Comment'] = df_600['Comment'].apply(remove_punctuation)

Adjust timestamp:

In [6]:
# Split the 'Timestamp' into 'Date' and 'Time' and keep only the 'Date'
df_600['Date'] = df_600['Timestamp'].str.split('T', expand=True)[0]

# Move 'Date' to the first column
cols = ['Date'] + [col for col in df_600.columns if col != 'Date']
df_600 = df_600[cols]

# Drop the 'Timestamp' 
df_600.drop('Timestamp', axis=1, inplace=True)

In [7]:
# Remove rows where 'Content' is empty or has 2 words or less
df_600 = df_600[df_600['Comment'].apply(lambda x: isinstance(x, str) and len(x.split()) > 2 and x.strip() != '')]

# Remove leading and trailing whitespace and excessive internal whitespace
df_600['Comment'] = df_600['Comment'].str.strip()  # Remove leading and trailing whitespace
df_600['Comment'] = df_600['Comment'].apply(lambda x: re.sub(r'\s+', ' ', x))  # Remove excessive internal whitespace

In [8]:
# Preview the cleaned data
df_600.head(15)

Unnamed: 0,Date,VideoID,Comment
0,2024-05-01,AXGNb76QgH0,ehy do you miss out the 500x its same and hybr...
1,2024-04-28,AXGNb76QgH0,enjoyed watching thank you
2,2024-04-06,AXGNb76QgH0,nobody are liking the eye lits so far
3,2024-03-16,AXGNb76QgH0,dont follow the worldly trends follow jesus ch...
4,2024-03-08,AXGNb76QgH0,25000 is a fair price
5,2024-03-04,AXGNb76QgH0,it looks solid and i know its a great driving ...
6,2024-02-24,AXGNb76QgH0,really love the video and review of new fiat 6...
7,2024-02-16,AXGNb76QgH0,imagine if fiat invested in perfecting the 500...
8,2024-02-15,AXGNb76QgH0,omg i hate you shoesthey look like someting ou...
9,2024-02-13,AXGNb76QgH0,my wife has a 500 c and all we love it nobody ...


# 600e - Filtering Comments 

In [9]:
# Define keywords related to the car
car_keywords = ['fiat', '600e', 'electric', 'vehicle', 'ev', 'evs', 'car', 'cars', 
                'uk',  'price', 'deal', 'purchase', 'grant', 'grants', 'egrant', 'e grant',
                'battery', 'kwh', 'charge', 'charging', 'charger', 'power', 'electricity',
                'engine', 'speed', 'range', 'performance', 'time', 'mile', 'miles', 
                'seat', 'seats', 'brake', 'pedal', 'window', 'windows', 'drive', 'driving',
                'boot', 'bootspace', 'boot space', 'trunk',
                'colour', 'design', 'dashboard', 'infotainment', 'interior', 'exterior', 
                'modern', 'comfortable', 'efficient', 'feature', 'feature', 'urban', 'city',
                'safety', 'family', 'first']


# Function to check if any keyword is in the comment
def is_related_to_car(comment):
    if pd.isna(comment):
        return False
    return any(keyword in comment for keyword in car_keywords)

# Filter the DataFrame
df_600 = df_600[df_600['Comment'].apply(is_related_to_car)]

# Drop the first row of data 
df_600 = df_600.iloc[1:]
df_600.head()

Unnamed: 0,Date,VideoID,Comment
4,2024-03-08,AXGNb76QgH0,25000 is a fair price
5,2024-03-04,AXGNb76QgH0,it looks solid and i know its a great driving ...
6,2024-02-24,AXGNb76QgH0,really love the video and review of new fiat 6...
7,2024-02-16,AXGNb76QgH0,imagine if fiat invested in perfecting the 500...
9,2024-02-13,AXGNb76QgH0,my wife has a 500 c and all we love it nobody ...


In [10]:
# Save as csv file 
df_600.to_csv('600e - Potential Customers.csv', index=False)

In [11]:
df_600.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1148 entries, 4 to 1654
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     1148 non-null   object
 1   VideoID  1148 non-null   object
 2   Comment  1148 non-null   object
dtypes: object(3)
memory usage: 35.9+ KB


# 500e - Cleaning Comments

In [12]:
# Read csv file
df_500e = pd.read_csv('DataPull-500e.csv', engine='python', escapechar="\\")
df_500e.head(10) 

Unnamed: 0,Timestamp,Username,VideoID,Comment,Date
0,2024-05-04T20:35:00Z,@miscetc-tm2yt,0kDbvxpjLZs,"9 seconds, for an electric car that's especial...",2024-05-04T20:35:00Z
1,2024-02-16T11:44:20Z,@LucuYT,0kDbvxpjLZs,Good honest review thanks. Like the car I’m...,2024-02-16T11:44:20Z
2,2024-02-13T22:37:50Z,@Lewis_mp3,0kDbvxpjLZs,This women is awfully irritating,2024-02-13T22:37:50Z
3,2024-02-13T22:21:00Z,@mitchib1440,0kDbvxpjLZs,This presenter is absolutely amazing! More of ...,2024-02-13T22:21:00Z
4,2024-01-06T11:27:45Z,@IMAN7THRYLOS,0kDbvxpjLZs,This is a genius car. This is a car that you d...,2024-01-06T11:27:45Z
5,2023-10-21T11:33:22Z,@musiceditor7083,0kDbvxpjLZs,Got a Twinair Mito - done 60k in it & it's gre...,2023-10-21T11:33:22Z
6,2023-09-16T04:33:48Z,@cmmartti,0kDbvxpjLZs,No spare wheel is a dealbreaker. I love that g...,2023-09-16T04:33:47Z
7,2023-08-17T22:03:40Z,@astra-rb6sz,0kDbvxpjLZs,I love this car,2023-08-17T22:03:40Z
8,2023-06-01T06:25:30Z,@BF4pawntard,0kDbvxpjLZs,I absolutely hate those big entertainment scre...,2023-06-01T06:25:30Z
9,2023-04-28T17:32:15Z,@o2barebel,0kDbvxpjLZs,Why are you driving in the middle of the road?,2023-04-28T17:32:15Z


In [13]:
# Drop column 'Username' and 'Date'
df_500e = df_500e.drop(columns=['Username', 'Date'])
df_500e.head(10)

Unnamed: 0,Timestamp,VideoID,Comment
0,2024-05-04T20:35:00Z,0kDbvxpjLZs,"9 seconds, for an electric car that's especial..."
1,2024-02-16T11:44:20Z,0kDbvxpjLZs,Good honest review thanks. Like the car I’m...
2,2024-02-13T22:37:50Z,0kDbvxpjLZs,This women is awfully irritating
3,2024-02-13T22:21:00Z,0kDbvxpjLZs,This presenter is absolutely amazing! More of ...
4,2024-01-06T11:27:45Z,0kDbvxpjLZs,This is a genius car. This is a car that you d...
5,2023-10-21T11:33:22Z,0kDbvxpjLZs,Got a Twinair Mito - done 60k in it & it's gre...
6,2023-09-16T04:33:48Z,0kDbvxpjLZs,No spare wheel is a dealbreaker. I love that g...
7,2023-08-17T22:03:40Z,0kDbvxpjLZs,I love this car
8,2023-06-01T06:25:30Z,0kDbvxpjLZs,I absolutely hate those big entertainment scre...
9,2023-04-28T17:32:15Z,0kDbvxpjLZs,Why are you driving in the middle of the road?


In [14]:
# Fill None values in 'Comment' column with empty string to avoid errors
df_500e['Comment'] = df_500e['Comment'].fillna('')

In [15]:
# Adjust 'Comment' column to lowercase
df_500e['Comment'] = df_500e['Comment'].str.lower()

In [16]:
# Remove punctuation
def remove_punctuation(text):
    return re.sub(r'[^\w\s\U0001F600-\U0001F64F]', '', text)

# Apply the function to the 'Comment' column
df_500e['Comment'] = df_500e['Comment'].apply(remove_punctuation)

In [17]:
# Split the 'Timestamp' into 'Date' and 'Time' and keep only the 'Date'
df_500e['Date'] = df_500e['Timestamp'].str.split('T', expand=True)[0]

# Move 'Date' to the first column
cols = ['Date'] + [col for col in df_500e.columns if col != 'Date']
df_500e = df_500e[cols]

# Drop the 'Timestamp' 
df_500e.drop('Timestamp', axis=1, inplace=True)

In [18]:
# Remove rows where 'Content' is empty or has 2 words or less
df_500e = df_500e[df_500e['Comment'].apply(lambda x: isinstance(x, str) and len(x.split()) > 2 and x.strip() != '')]

# Remove leading and trailing whitespace and excessive internal whitespace
df_500e['Comment'] = df_500e['Comment'].str.strip()  # Remove leading and trailing whitespace
df_500e['Comment'] = df_500e['Comment'].apply(lambda x: re.sub(r'\s+', ' ', x))  # Remove excessive internal whitespace

In [19]:
# Preview the cleaned data
df_500e.head(15)

Unnamed: 0,Date,VideoID,Comment
0,2024-05-04,0kDbvxpjLZs,9 seconds for an electric car thats especially...
1,2024-02-16,0kDbvxpjLZs,good honest review thanks like the car im in a...
2,2024-02-13,0kDbvxpjLZs,this women is awfully irritating
3,2024-02-13,0kDbvxpjLZs,this presenter is absolutely amazing more of h...
4,2024-01-06,0kDbvxpjLZs,this is a genius car this is a car that you do...
5,2023-10-21,0kDbvxpjLZs,got a twinair mito done 60k in it its great th...
6,2023-09-16,0kDbvxpjLZs,no spare wheel is a dealbreaker i love that ga...
7,2023-08-17,0kDbvxpjLZs,i love this car
8,2023-06-01,0kDbvxpjLZs,i absolutely hate those big entertainment scre...
9,2023-04-28,0kDbvxpjLZs,why are you driving in the middle of the road


# 500e - Filtering Comments

In [20]:
# Define keywords related to the car
car_keywords = ['fiat', '500e', 'electric', 'vehicle', 'ev', 'evs', 'car', 'cars', 
                'uk',  'price', 'deal', 'purchase', 'grant', 'grants', 'egrant', 'e grant',
                'battery', 'kwh', 'charge', 'charging', 'charger', 'power', 'electricity',
                'engine', 'speed', 'range', 'performance', 'time', 'mile', 'miles', 
                'seat', 'seats', 'brake', 'pedal', 'window', 'windows', 'drive', 'driving',
                'boot', 'bootspace', 'boot space', 'trunk',
                'colour', 'design', 'dashboard', 'infotainment', 'interior', 'exterior', 
                'modern', 'comfortable', 'efficient', 'feature', 'feature', 'urban', 'city',
                'safety', 'family', 'first']

# Function to check if any keyword is in the comment
def is_related_to_car(comment):
    if pd.isna(comment):
        return False
    return any(keyword in comment for keyword in car_keywords)

# Filter the DataFrame
df_500e = df_500e[df_500e['Comment'].apply(is_related_to_car)]
df_500e.head()

Unnamed: 0,Date,VideoID,Comment
0,2024-05-04,0kDbvxpjLZs,9 seconds for an electric car thats especially...
1,2024-02-16,0kDbvxpjLZs,good honest review thanks like the car im in a...
4,2024-01-06,0kDbvxpjLZs,this is a genius car this is a car that you do...
5,2023-10-21,0kDbvxpjLZs,got a twinair mito done 60k in it its great th...
6,2023-09-16,0kDbvxpjLZs,no spare wheel is a dealbreaker i love that ga...


In [21]:
# Save as csv file 
df_500e.to_csv('500e - YouTube Comments_Clean.csv', index=False)

In [22]:
df_500e.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2540 entries, 0 to 3826
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     2540 non-null   object
 1   VideoID  2540 non-null   object
 2   Comment  2540 non-null   object
dtypes: object(3)
memory usage: 79.4+ KB
