# ARP - Random Sampling for Categorisation of Datasets

In [15]:
import pandas as pd
import random
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Ensure you have downloaded the VADER lexicon
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Jey/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Fiat 600e

## 600e - YouTube Comments

In [16]:
# Load the dataset
file_path = '600e - Potential Customers.csv'
comments_df = pd.read_csv(file_path)

# Display the first few rows to understand the structure
print(comments_df.head())

         Date           ID                                               Text
0  2024-03-08  AXGNb76QgH0                              25000 is a fair price
1  2024-03-04  AXGNb76QgH0  it looks solid and i know its a great driving ...
2  2024-02-24  AXGNb76QgH0  really love the video and review of new fiat 6...
3  2024-02-16  AXGNb76QgH0  imagine if fiat invested in perfecting the 500...
4  2024-02-13  AXGNb76QgH0  my wife has a 500 c and all we love it nobody ...


In [17]:
comments_df.shape

(1148, 3)

In [18]:
# Step 2: Random Sampling
# Set sample size to 5 percent of the total number of comments
sample_size = int(0.05 * comments_df.shape[0])
print(f"Sample size: {sample_size}")
sampled_comments = comments_df.sample(n=sample_size, random_state=3)

Sample size: 57


In [19]:
sampled_comments

Unnamed: 0,Date,ID,Text
314,2023-09-23,VokQJrHzBR8,plus was there mention of charging speeds etc
845,2023-09-27,ejT6GwY6cSY,xxwookey 35 grand is not the entry fee at all ...
119,2024-04-09,MvgfB8kcS1s,bill_heywood so their balanced approach seems ...
518,2023-09-27,ejT6GwY6cSY,pro tip for the producers of the show please w...
740,2023-09-26,ejT6GwY6cSY,this is another disappointing ev it illustrate...
303,2023-09-24,VokQJrHzBR8,my favourite part of this review was when look...
1053,2023-09-26,ejT6GwY6cSY,much more interesting than the 500 doesnt look...
103,2024-05-03,MvgfB8kcS1s,stupid fiat taken out the interial mood lighti...
505,2023-09-28,ejT6GwY6cSY,stopped watching when you suggested that one p...
91,2023-10-10,AXGNb76QgH0,youre way behind the times i had a fiat for ye...


In [20]:
# Step 3: Refine Manual Review and Categorisation
def categorize_comment(Text):
    current_customer_keywords = [
    "my fiat", "i own", "i drive", "my car", "as an owner", "my 600e", "i have", "ive had", 
    "i bought", "i purchased", "im driving", "im using", "my vehicle",
    "driving my", "owned", "have owned", "currently own", "currently driving", 
    "i’ve driven", "my experience", "experiencing with", "have been using", "have been driving",
    "my ride", "in my fiat", "my journey with", "ownership of", "been driving", 
    "loving my", "bought my", "recently purchased", "enjoying my", "love driving"
   ]
    potential_customer_keywords = [
        "worth", "better", "if", "whether", "thinking", "think", "thought", "buying", "buy", 
        "considering", "consider", "considered", "looking to", "interested in", "planned", 
        "plan to", "planning to", "want", "wanted", "getting", "get", "available", "how much", 
        "prefer", "budget", "cost", "price", "than", "new car", "first car", "next car", 
        "decision", "decide", "decision", "deciding", "choose", "choice", "would love", 
        "purchase", "whats the", "what about", "is it", "does it", "when will", "test drive", 
        "lease", "financing", "finance options", "payment", "dealer", "dealership", 
        "excited about", "curious about", "researching", "research", "looking into", 
        "exploring", "versus", "vs", "as good as", "compared to", "alternative", "impressed", 
        "amazing", "fantastic", "great", "good", "review", "opinion", "feedback", "would you", 
        "recommend", "should i", "like it", "loving it", "love it","budget"
    ]

    for keyword in current_customer_keywords:
        if keyword in Text:
            return 'Current Customers'
    for keyword in potential_customer_keywords:
        if keyword in Text:
            return 'Potential Customers'
    return 'General Public'

# Apply categorisation
sampled_comments['Category'] = sampled_comments['Text'].apply(categorize_comment)

In [21]:
# Calculate Percentages
total_comments = len(sampled_comments)
percent_current = round((sampled_comments['Category'] == 'Current Customers').sum() / total_comments * 100, 2)
percent_potential = round((sampled_comments['Category'] == 'Potential Customers').sum() / total_comments * 100, 2)
percent_general = round((sampled_comments['Category'] == 'General Public').sum() / total_comments * 100, 2)

# Display the results
print(f"600e - YT Comments Percentage of Current Customers: {percent_current}%")
print(f"600e - YT Comments Percentage of Potential Customers: {percent_potential}%")
print(f"600e - YT Comments Percentage of General Public: {percent_general}%")

600e - YT Comments Percentage of Current Customers: 7.02%
600e - YT Comments Percentage of Potential Customers: 66.67%
600e - YT Comments Percentage of General Public: 26.32%


# Fiat 500e

## 500e - YouTube Comments

In [22]:
# Load the dataset
file_path = '500e - YouTube Comments_Clean.csv'
comments_df = pd.read_csv(file_path)

# Display the first few rows to understand the structure
print(comments_df.head())

         Date           ID                                               Text
0  2024-05-04  0kDbvxpjLZs  9 seconds for an electric car thats especially...
1  2024-02-16  0kDbvxpjLZs  good honest review thanks like the car im in a...
2  2024-01-06  0kDbvxpjLZs  this is a genius car this is a car that you do...
3  2023-10-21  0kDbvxpjLZs  got a twinair mito done 60k in it its great th...
4  2023-09-16  0kDbvxpjLZs  no spare wheel is a dealbreaker i love that ga...


In [23]:
comments_df.shape

(2540, 3)

In [24]:
# Step 2: Random Sampling
sample_size = int(0.05 * comments_df.shape[0])
print(f"Sample size: {sample_size}")
sampled_comments = comments_df.sample(n=sample_size, random_state=3)

Sample size: 127


In [25]:
sampled_comments

Unnamed: 0,Date,ID,Text
1875,2020-12-02,JSwvQE0tgyM,i have to say of all the small ev cars the fia...
617,2022-11-29,pMJ2fEhN-Vc,hubnut good luck with affordable heating this ...
1691,2020-12-02,JSwvQE0tgyM,ok sorry i typically do 5050 ev and piston stu...
2323,2024-06-03,OSvuSISisD8,but then again the mitsubishi mirage only has ...
2517,2024-06-03,OSvuSISisD8,they are going to sit on the lot for several m...
...,...,...,...
2199,2024-05-08,R-NFILq5hhA,i only drive 50 miles a month would love to ha...
1048,2021-02-03,JSwvQE0tgyM,is this a four seater car
838,2022-11-28,pMJ2fEhN-Vc,the timing does say it all they only show up t...
972,2023-02-18,JSwvQE0tgyM,whats the range of the 42kwh at constant 70mph


In [26]:
# Step 3: Refine Manual Review and Categorisation
def categorize_comment(Text):
    current_customer_keywords = [
    "my fiat", "i own", "i drive", "my car", "as an owner", "my 600e", "i have", "ive had", 
    "i bought", "i purchased", "im driving", "im using", "my vehicle",
    "driving my", "owned", "have owned", "currently own", "currently driving", 
    "i’ve driven", "my experience", "experiencing with", "have been using", "have been driving",
    "my ride", "in my fiat", "my journey with", "ownership of", "been driving", 
    "loving my", "bought my", "recently purchased", "enjoying my", "love driving"
   ]
    potential_customer_keywords = [
        "worth", "better", "if", "whether", "thinking", "think", "thought", "buying", "buy", 
        "considering", "consider", "considered", "looking to", "interested in", "planned", 
        "plan to", "planning to", "want", "wanted", "getting", "get", "available", "how much", 
        "prefer", "budget", "cost", "price", "than", "new car", "first car", "next car", 
        "decision", "decide", "decision", "deciding", "choose", "choice", "would love", 
        "purchase", "whats the", "what about", "is it", "does it", "when will", "test drive", 
        "lease", "financing", "finance options", "payment", "dealer", "dealership", 
        "excited about", "curious about", "researching", "research", "looking into", 
        "exploring", "versus", "vs", "as good as", "compared to", "alternative", "impressed", 
        "amazing", "fantastic", "great", "good", "review", "opinion", "feedback", "would you", 
        "recommend", "should i", "like it", "loving it", "love it","budget"
    ]

    for keyword in current_customer_keywords:
        if keyword in Text:
            return 'Current Customers'
    for keyword in potential_customer_keywords:
        if keyword in Text:
            return 'Potential Customers'
    return 'General Public'

# Apply categorisation
sampled_comments['Category'] = sampled_comments['Text'].apply(categorize_comment)

In [27]:
# Calculate Percentages
total_comments = len(sampled_comments)
percent_current = round((sampled_comments['Category'] == 'Current Customers').sum() / total_comments * 100, 2)
percent_potential = round((sampled_comments['Category'] == 'Potential Customers').sum() / total_comments * 100, 2)
percent_general = round((sampled_comments['Category'] == 'General Public').sum() / total_comments * 100, 2)

# Display the results
print(f"500e - YT Comments Percentage of Current Customers: {percent_current}%")
print(f"500e - YT Comments Percentage of Potential Customers: {percent_potential}%")
print(f"500e - YT Comments Percentage of General Public: {percent_general}%")

500e - YT Comments Percentage of Current Customers: 7.09%
500e - YT Comments Percentage of Potential Customers: 71.65%
500e - YT Comments Percentage of General Public: 21.26%


## 500e - Forums

In [28]:
forum_speakev = pd.read_csv('500e - SpeakEV_Clean.csv')
forum_ph = pd.read_csv('500e - PistonHeads_Clean.csv')

# Merge the two dataframes
forums_df = pd.concat([forum_speakev, forum_ph], ignore_index=True)

# Keep only data after March 2020
forums_df['Date'] = pd.to_datetime(forums_df['Date'])
forums_df = forums_df[forums_df['Date'] >= '2020-03-01']

In [29]:
# Display the first few rows to understand the structure
print(forums_df.head())

        Date                                                URL  \
0 2023-07-21  https://www.speakev.com/threads/fiat-500e-icon...   
1 2023-08-17  https://www.speakev.com/threads/what-charging-...   
2 2022-10-22  https://www.speakev.com/threads/fiat-500e-serv...   
3 2023-09-12  https://www.speakev.com/threads/thoughts-on-th...   
4 2023-09-26  https://www.speakev.com/threads/fiat-500e-char...   

                                                Text  
0  hi there we will receive our new fiat 500e ico...  
1  hi there i have ordered a fiat 500e icon on le...  
2  i had my 2015 fiat 500e shipped from ca a few ...  
3  the latest fiat 500e la prima designio by kahn...  
4  is there a way to delete not just unselect a c...  


In [30]:
forums_df.shape

(246, 3)

In [31]:
# Step 2: Random Sampling
sample_size = int(0.05 * forums_df.shape[0])
print(f"Sample size: {sample_size}")
sampled_forums = forums_df.sample(n=sample_size, random_state=5)

Sample size: 12


In [32]:
sampled_forums

Unnamed: 0,Date,URL,Text
60,2022-05-08,https://www.speakev.com/threads/no-ev-made-it-...,as predicted or not tesla has disappeared from...
57,2022-08-22,https://www.speakev.com/threads/hello-from-lux...,moien im änder from luxembourg aged 44 and las...
180,2022-08-25,https://www.speakev.com/threads/smart-fortwo-e...,i am close to making the decision to get my fi...
833,2023-08-21,https://www.pistonheads.com/gassing/topic.asp?...,be serious weight is irrelevant because it has...
52,2023-12-07,https://www.speakev.com/threads/are-dumb-home-...,hi all i have had my nissan leaf since 2016 ty...
698,2022-02-01,https://www.pistonheads.com/gassing/topic.asp?...,ive got one had it since nov ours is very much...
900,2021-12-31,https://www.pistonheads.com/gassing/topic.asp?...,had one 5 mths and sold it as an experienced e...
64,2021-11-27,https://www.speakev.com/threads/miles-per-kwh-...,dort had 500e 3 months now and energy in is wo...
736,2020-07-21,https://www.pistonheads.com/gassing/topic.asp?...,we had an i3 a few years ago and she loved it ...
85,2021-05-20,https://www.speakev.com/threads/sold-zoe-22kwh...,hi having joined the ev revolution in 2016 wit...


In [33]:
# Step 3: Refine Manual Review and Categorisation
def categorize_comment(Text):
    current_customer_keywords = [
    "my fiat", "i own", "i drive", "my car", "as an owner", "my 600e", "i have", "ive had", 
    "i bought", "i purchased", "im driving", "im using", "my vehicle",
    "driving my", "owned", "have owned", "currently own", "currently driving", 
    "i’ve driven", "my experience", "experiencing with", "have been using", "have been driving",
    "my ride", "in my fiat", "my journey with", "ownership of", "been driving", 
    "loving my", "bought my", "recently purchased", "enjoying my", "love driving"
   ]
    potential_customer_keywords = [
        "worth", "better", "if", "whether", "thinking", "think", "thought", "buying", "buy", 
        "considering", "consider", "considered", "looking to", "interested in", "planned", 
        "plan to", "planning to", "want", "wanted", "getting", "get", "available", "how much", 
        "prefer", "budget", "cost", "price", "than", "new car", "first car", "next car", 
        "decision", "decide", "decision", "deciding", "choose", "choice", "would love", 
        "purchase", "whats the", "what about", "is it", "does it", "when will", "test drive", 
        "lease", "financing", "finance options", "payment", "dealer", "dealership", 
        "excited about", "curious about", "researching", "research", "looking into", 
        "exploring", "versus", "vs", "as good as", "compared to", "alternative", "impressed", 
        "amazing", "fantastic", "great", "good", "review", "opinion", "feedback", "would you", 
        "recommend", "should i", "like it", "loving it", "love it","budget"
    ]

    for keyword in current_customer_keywords:
        if keyword in Text:
            return 'Current Customers'
    for keyword in potential_customer_keywords:
        if keyword in Text:
            return 'Potential Customers'
    return 'General Public'

# Apply categorisation
sampled_forums['Category'] = sampled_forums['Text'].apply(categorize_comment)

In [34]:
# Calculate Percentages
total_forums = len(sampled_forums)
percent_current = round((sampled_forums['Category'] == 'Current Customers').sum() / total_forums * 100, 2)
percent_potential = round((sampled_forums['Category'] == 'Potential Customers').sum() / total_forums * 100, 2)
percent_general = round((sampled_forums['Category'] == 'General Public').sum() / total_forums * 100, 2)

# Display the results
print(f"500e - Forums Percentage of Current Customers: {percent_current}%")
print(f"500e - Forums Percentage of Potential Customers: {percent_potential}%")
print(f"500e - Forums Percentage of General Public: {percent_general}%")

500e - Forums Percentage of Current Customers: 33.33%
500e - Forums Percentage of Potential Customers: 58.33%
500e - Forums Percentage of General Public: 8.33%


## 500e - YouTube Comments + Forums (Merged Dataset)

In [35]:
# Load the dataset
file_path = '500e - Potential Customers.csv'
comments_forums_df = pd.read_csv(file_path)

# Display the first few rows to understand the structure
comments_forums_df.head()

Unnamed: 0,Date,Source,Text
0,2023-07-21,SpeakEV,hi there we will receive our new fiat 500e ico...
1,2023-08-17,SpeakEV,hi there i have ordered a fiat 500e icon on le...
2,2022-10-22,SpeakEV,i had my 2015 fiat 500e shipped from ca a few ...
3,2023-09-12,SpeakEV,the latest fiat 500e la prima designio by kahn...
4,2023-09-26,SpeakEV,is there a way to delete not just unselect a c...


In [36]:
comments_forums_df.shape

(2786, 3)

In [37]:
# Step 2: Random Sampling
sample_size = int(0.05 * comments_forums_df.shape[0])
print(f"Sample size: {sample_size}")
sampled_comments_forums = comments_forums_df.sample(n=sample_size, random_state=5)

Sample size: 139


In [38]:
sampled_comments_forums

Unnamed: 0,Date,Source,Text
1410,2020-12-05,YouTube Comment,this 500e is perfect for the week than the amg...
413,2021-06-03,YouTube Comment,nikki is a great presenter love the fiat too
982,2022-11-28,YouTube Comment,i could be attracted to an ev but why oh why a...
1688,2020-12-02,YouTube Comment,thelatebrakeshow no worries made me cackle thi...
1236,2022-01-21,YouTube Comment,wait till the italian made electrics start foo...
...,...,...,...
1275,2021-04-14,YouTube Comment,sherpa i think of a mountain climber like 4x4 ...
2609,2024-06-03,YouTube Comment,forest i have a challenge car for you a 2009 m...
1239,2021-12-13,YouTube Comment,please stop to compare with honda fiat is a fi...
1067,2022-11-28,YouTube Comment,my old renault scenic 06 did that if i was sto...


In [39]:
# Step 3: Refine Manual Review and Categorisation
def categorize_comment(Text):
    current_customer_keywords = [
    "my fiat", "i own", "i drive", "my car", "as an owner", "my 600e", "i have", "ive had", 
    "i bought", "i purchased", "im driving", "im using", "my vehicle",
    "driving my", "owned", "have owned", "currently own", "currently driving", 
    "i’ve driven", "my experience", "experiencing with", "have been using", "have been driving",
    "my ride", "in my fiat", "my journey with", "ownership of", "been driving", 
    "loving my", "bought my", "recently purchased", "enjoying my", "love driving"
   ]
    potential_customer_keywords = [
        "worth", "better", "if", "whether", "thinking", "think", "thought", "buying", "buy", 
        "considering", "consider", "considered", "looking to", "interested in", "planned", 
        "plan to", "planning to", "want", "wanted", "getting", "get", "available", "how much", 
        "prefer", "budget", "cost", "price", "than", "new car", "first car", "next car", 
        "decision", "decide", "decision", "deciding", "choose", "choice", "would love", 
        "purchase", "whats the", "what about", "is it", "does it", "when will", "test drive", 
        "lease", "financing", "finance options", "payment", "dealer", "dealership", 
        "excited about", "curious about", "researching", "research", "looking into", 
        "exploring", "versus", "vs", "as good as", "compared to", "alternative", "impressed", 
        "amazing", "fantastic", "great", "good", "review", "opinion", "feedback", "would you", 
        "recommend", "should i", "like it", "loving it", "love it","budget"
    ]

    for keyword in current_customer_keywords:
        if keyword in Text:
            return 'Current Customers'
    for keyword in potential_customer_keywords:
        if keyword in Text:
            return 'Potential Customers'
    return 'General Public'

# Apply categorisation
sampled_comments_forums['Category'] = sampled_comments_forums['Text'].apply(categorize_comment)


In [40]:
# Calculate Percentages
total_comments_forums = len(sampled_comments_forums)
percent_current = round((sampled_comments_forums['Category'] == 'Current Customers').sum() / total_comments_forums * 100, 2)
percent_potential = round((sampled_comments_forums['Category'] == 'Potential Customers').sum() / total_comments_forums * 100, 2)
percent_general = round((sampled_comments_forums['Category'] == 'General Public').sum() / total_comments_forums * 100, 2)

# Display the results
print(f"500e - Merged YT Comments + Forums Percentage of Current Customers: {percent_current}%")
print(f"500e - Merged YT Comments + Forums  Percentage of Potential Customers: {percent_potential}%")
print(f"500e - Merged YT Comments + Forums  Percentage of General Public: {percent_general}%")

500e - Merged YT Comments + Forums Percentage of Current Customers: 10.79%
500e - Merged YT Comments + Forums  Percentage of Potential Customers: 63.31%
500e - Merged YT Comments + Forums  Percentage of General Public: 25.9%
