In [33]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [34]:
!apt-get install -y libhunspell-dev
!pip install hunspell

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libhunspell-dev is already the newest version (1.7.0-4build1).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.


In [35]:
!pip install nltk



In [36]:
# Imports

# General
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Preprocessing
import unicodedata
from bs4 import BeautifulSoup
import spacy
import hunspell
from nltk.corpus import words as nltk_words
import pandas as pd

In [63]:
# Load the train set
train = pd.read_csv('./train.csv', sep=';')

In [38]:
train.sample(10, random_state = 42)

Unnamed: 0,rating,combined_text
9030,4,No bad for the price Not bright enough.
12968,1,Do not recommend Quality control on this product is not very good. First one I received had a cracked viewing window. Got a replacement and it was also cracked. Spend the money elsewhere
3817,4,Lasting Comfortable
11009,4,Heavy A little extra heavy for my use
1493,1,Not great Does not get stains out any better than a basic stain remover. Using it for cleaning grease in the kitchen is intolerable because the smell gives me a horrible headache.
9001,2,Not anymore Price has more than doubled.<br />Stickers are cut so they are extremely hard to use.<br />Side cuts on boxes are not as aligned as they used to be.
7796,1,DON'T GET IT ! I DID NOT LIKE IT IT SUCKS DONT<br />GET IT DON'T WASTE ORDERING<br />IT !
10480,5,Love it Exceed my expectations Great quality Love it<br />Exceed my expectations<br />Great quality
13188,1,"Not as described Description says: “ZMAX galvanization offers extra corrosion resistance for exterior and treated-wood applications”. NOT!<br />The ties I received were not galvanized, bare metal."
2900,4,Metal hook and loop? These will get the job done. Average quality - as expected.


### Normalize

In [39]:
def normalize(df, column_name):

  '''Concert to lowercase, remove HTML tags, and normalize to ASCII
  (remove accents, special characters, etc.)'''

  normalized_text = df[column_name].str.lower()

  normalized_text = normalized_text.apply(lambda x:
      BeautifulSoup(str(x), "html.parser").get_text())

  normalized_text = normalized_text.apply(lambda x:
      unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore'))

  return normalized_text

In [40]:
train['normalized_text'] = normalize(train,'combined_text')
train.sample(5, random_state = 42)

  BeautifulSoup(str(x), "html.parser").get_text())


Unnamed: 0,rating,combined_text,normalized_text
9030,4,No bad for the price Not bright enough.,no bad for the price not bright enough.
12968,1,Do not recommend Quality control on this product is not very good. First one I received had a cracked viewing window. Got a replacement and it was also cracked. Spend the money elsewhere,do not recommend quality control on this product is not very good. first one i received had a cracked viewing window. got a replacement and it was also cracked. spend the money elsewhere
3817,4,Lasting Comfortable,lasting comfortable
11009,4,Heavy A little extra heavy for my use,heavy a little extra heavy for my use
1493,1,Not great Does not get stains out any better than a basic stain remover. Using it for cleaning grease in the kitchen is intolerable because the smell gives me a horrible headache.,not great does not get stains out any better than a basic stain remover. using it for cleaning grease in the kitchen is intolerable because the smell gives me a horrible headache.


In [41]:
# Check if any row in the 'normalized_text' column contains '<br />'
print(train['normalized_text'].str.contains('<br />', regex=False).any())

False


### Correct spelling mistakes

<b>Comment: </b>Before applying spelling corrections wihout a criterion (which could lead to a risk of altering valid text), samples with potential spelling errors will be identified using a dictionary-based validation. A dictionary-based validation will check if words in the text exist in the NLTK dictionary.

Later, Hunspell, a spell-checking library, will be applied to correct rows with potential spelling errors.

In [42]:
import nltk
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [43]:
# Load English words
valid_words = set(nltk_words.words())

In [44]:
# Function to detect potential errors
def detect_errors(text):
    """Check if text has spelling issues based on dictionary validation."""
    if not isinstance(text, str):
        return False
    tokens = text.split()
    invalid_ratio = sum(1 for word in tokens if word not in valid_words) / len(tokens)
    return invalid_ratio > 0.2

In [45]:
# Detect errors
train["likely_problematic"] = train["normalized_text"].apply(detect_errors)

In [46]:
# Show sample rows where there are likely spelling errors
problematic_samples = train[train["likely_problematic"]]
pd.set_option('display.max_colwidth', None)
problematic_samples.sample(10)

Unnamed: 0,rating,combined_text,normalized_text,likely_problematic
8238,2,"Case is garbage!!!!!! The case it came in was broke on the heaviest weight, the 500g weight.","case is garbage!!!!!! the case it came in was broke on the heaviest weight, the 500g weight.",True
2803,4,"Ok but narrow hook. They seem sturdy and hook well over doors, plastic and don't scratch the door but the hanging hook side is too narrow, barely fits a finger, making it not very practical for hanging clothes, pants, jackets. Good for small towels, purses, ties. Limiting their use. Wish were wider opening.","ok but narrow hook. they seem sturdy and hook well over doors, plastic and don't scratch the door but the hanging hook side is too narrow, barely fits a finger, making it not very practical for hanging clothes, pants, jackets. good for small towels, purses, ties. limiting their use. wish were wider opening.",True
2878,3,Always concerning The product looks and reads nice. But showing a print impossible without massive support is concerning.,always concerning the product looks and reads nice. but showing a print impossible without massive support is concerning.,True
9395,3,Poor suction Ok for hardwood or laminate floors. Not so good on rugs or carpet. Really poor suction.,poor suction ok for hardwood or laminate floors. not so good on rugs or carpet. really poor suction.,True
10272,4,Great product. They do not supply any assembly drawings ... Great product. They do not supply any assembly drawings and no software. You need to look and study the product pictures and pictures other people post to assemble this printer.,great product. they do not supply any assembly drawings ... great product. they do not supply any assembly drawings and no software. you need to look and study the product pictures and pictures other people post to assemble this printer.,True
6650,5,"Fixed my hvac ac Was told I needed a new compressor. Bought this and it fixed the ac, no compressor needed and saved a lot of money.","fixed my hvac ac was told i needed a new compressor. bought this and it fixed the ac, no compressor needed and saved a lot of money.",True
12420,5,"Lightweight Powerhouse Better than expected, perfect for stairs, blinds and hard to reach places. Very satisfied, thinking about purchasing another for home use.","lightweight powerhouse better than expected, perfect for stairs, blinds and hard to reach places. very satisfied, thinking about purchasing another for home use.",True
7018,2,Cheap and breaks quick Broke after using for a few months,cheap and breaks quick broke after using for a few months,True
3535,2,"Insulating material peeling away / ripped inside bag Insulating material inside both bags ripped at the upper seams of the bags and this renders both bags useless, since heat escapes through the fabric of the bags. I got about 3 months of light use if these bags and they aren't durable for the price.","insulating material peeling away / ripped inside bag insulating material inside both bags ripped at the upper seams of the bags and this renders both bags useless, since heat escapes through the fabric of the bags. i got about 3 months of light use if these bags and they aren't durable for the price.",True
3390,3,"Difficult to print with I have a had a lot of difficulty getting good prints out of the spring of 2020 batch of this PLA. In particular, my standard PLA profiles don't work, I need to raise the bed temperature up to 70C and increase overall temperatures before I start getting good results. Once you get it working, it prints fine and has good color and surface quality; I have gotten lots of good prints out of the one spool I've bought. But there's no reason that YOU should put yourself through the frustration, just buy some other brand.","difficult to print with i have a had a lot of difficulty getting good prints out of the spring of 2020 batch of this pla. in particular, my standard pla profiles don't work, i need to raise the bed temperature up to 70c and increase overall temperatures before i start getting good results. once you get it working, it prints fine and has good color and surface quality; i have gotten lots of good prints out of the one spool i've bought. but there's no reason that you should put yourself through the frustration, just buy some other brand.",True


<b>Comment: </b> Unfortunately,an NLTK vocabulary was not enough to filter rows with spelling mistakes, as the reviews contain many product and domain specific words.
A way forward could be to include a domain-specific vocabulary in the NLTK vocabulary. Or to target specific patterns in text that may indicate potential spelling errors (such as repeated letters and standalone consonants) by using heuristics based on regular expressions.I will implement the second option and use Hunspell library to correct rows with likely suspicious patterns.

In [47]:
import re

# Function to detect repeated letters
def detect_repeated_letters(word):
    word = re.sub(r'[^a-zA-Z]', '', word)  # Remove non-alphabetical characters
    return bool(re.search(r'(.)\1{2,}', word))  # Matches repeated characters (e.g., 'ballll')

# Function to detect single consonants (standalone consonants)
def detect_standalone_consonants(word):
    return bool(re.match(r'^[bcdfghjklmnpqrstvwxyz]$', word))  # Matches single consonants like "b"

# Function to detect typographical errors using heuristics
def detect_typographical_errors(text):
    # Tokenize the text
    words = text.split()

    suspicious_words = []

    for word in words:

        # Check for repeated letters, missing vowels, or standalone consonants
        if detect_repeated_letters(word) or detect_standalone_consonants(word):
            suspicious_words.append(word)

    return suspicious_words

# Example function to apply to the DataFrame
def check_typographical_errors(df, column_name):
    # Apply the detect_typographical_errors function to each row of the specified column
    df['suspicious_words'] = df[column_name].apply(detect_typographical_errors)

    return df[['combined_text', 'suspicious_words']]


In [48]:
# Check for typographical errors in 'normalized_text'
suspicious_words_df = check_typographical_errors(train, 'normalized_text')
suspicious_rows = suspicious_words_df[suspicious_words_df['suspicious_words'].apply(lambda x: len(x) > 0)]

In [49]:
pd.reset_option('display.max_colwidth')
suspicious_rows.sample(8, random_state = 42)

Unnamed: 0,combined_text,suspicious_words
343,Not a double V belt My rating is not based on ...,"[v, v]"
6371,"Works well, buts it’s no CR-10s. EDIT/UPDATE: ...",[x]
5284,Good Meter with Lots of Features One of my fav...,[aaa]
1600,Does not stick to smooth plastic surfaces I li...,"[sooo, sooo]"
3382,Hard to tell... This was very easy to install....,[t]
3742,Handy to have & easy to use! It' s handy to ha...,[s]
3134,It was not a Blue color The color was not a bl...,[s]
12795,ok... It was so hard to asssamble.,[asssamble.]


In [50]:
def correct_spelling_hunspell(text):
    """Correct spelling using Hunspell."""
    h = hunspell.HunSpell('en_US.dic', 'en_US.aff')
    words = text.split()
    corrected_words = []
    for word in words:
        if not h.spell(word):
            suggestions = h.suggest(word)
            corrected_words.append(suggestions[0] if suggestions else word)
        else:
            corrected_words.append(word)
    return ' '.join(corrected_words)

In [51]:
# Apply the correction to rows
train["corrected_text"] = train.apply(
    lambda row: correct_spelling_hunspell(row["normalized_text"]) if len(row["suspicious_words"]) > 0 else row["normalized_text"],
    axis=1,
)

In [52]:
# List of specific row indices from the above table with suspicious words
specific_indices = [343, 6371, 5284, 1600, 3382, 3742, 3134, 12795]

# Display rows that had suspicious words originally
suspicious_rows = train[train['suspicious_words'].apply(lambda x: len(x) > 0)]

# Print the rows with the specific indices
pd.set_option('display.max_colwidth', None)
suspicious_rows.loc[specific_indices, ['normalized_text', 'suspicious_words', 'corrected_text']]

Unnamed: 0,normalized_text,suspicious_words,corrected_text
343,"not a double v belt my rating is not based on the quality of the belt i received. by all means, it looked like a fine belt. the issue is that it is not a replacement for the simplicity belt by the same number. the belt on my mower is a double v and this belt was not.","[v, v]",not a double v belt my rating is not based on the quality of the belt i received. by all means it looked like a fine belt. the issue is that it is not a replacement for the simplicity belt by the same number. the belt on my mower is a double v and this belt was not.
6371,"works well, buts its no cr-10s. edit/update: so not long after i wrote this, it started acting up...hard. i retract many of my previous statements.1) i still cant shake the nozzle drag issue - ive re adjusted every e-nut (multiple times), changed nozzles, re adjusted the gantry to ensure they square/plumb. but...i cant shake it. i also have a friend with an lk4 - same issue. strange.2) had to replace the nozzle - clogged. also, after more digging found hot end wasnt bolted down well. whoever assembled it did a poor job.3) white screen of death. its a longer thing.4) discovered the belt on the x gantry is slightly too big, and is not designed well where its bolted down. the end result? the x-gantry belt rubs continuously on the extruder. it can be fixed, sure, but a pita, no doubt.im gonna pull this down to a two star printer and dont recommend it to the new crowd. itll give a lot of headaches if you end up with a lemon. youll end up having to make changes shortly after a few prints - it doesnt hold up.this is not my first printer- my ender still prints flawlessly without issue - for years. this has given me more headaches than i have time to deal with in less than a month.5/10.old review:this is a great, inexpensive printer with a large(r) print volume compared to an ender. worth the buy, but not as good as a cr-10.print quality: per dollar is very decent, although the nozzle drags harder than id like to see, and i cant seem to shake it. lets be clear, its not quite the quality of an ender...but im making an entire ironman costume cosplay on it and its holding up well.setup: setup is very quick. 20 mins max, including the leveling. touchscreen works fine so far.adhesion. ridiculously good. honesty the best ive seen. actually it can sometimes be quite hard to get it off the bed sometimes...but id much prefer this for the tall and wide prints i do.overall, 7/10. yes, worth the buy, but not perfect by any means.",[x],works well buts its no Cr-10s premeditated so not long after i wrote this it started acting up...hard. i retract many of my previous statement i still cant shake the nozzle drag issue e vie re adjusted every e-nut multiple timeless changed nozzles re adjusted the gantry to ensure they squareness but...i cant shake it. i also have a friend with an ilk e same issue. strangeness had to replace the nozzle e clogged. also after more digging found hot end wast bolted down well. whoever assembled it did a poor jobber white screen of death. its a longer thinning discovered the belt on the x gantry is slightly too big and is not designed well where its bolted down. the end result the x-gantry belt rubs continuously on the extruder. it can be fixed sure but a pita no doubting gonna pull this down to a two star printer and font recommend it to the new crowd. till give a lot of headaches if you end up with a lemon. you'll end up having to make changes shortly after a few prints e it doest hold uppish is not my first printer- my ender still prints flawlessly without issue e for years. this has given me more headaches than i have time to deal with in less than a month.5/10.old revisionist is a great inexpensive printer with a large print volume compared to an ender. worth the buy but not as good as a Cr-10.print quality per dollar is very decent although the nozzle drags harder than id like to see and i cant seem to shake it. lets be clear its not quite the quality of an bartender mi making an entire iron man costume co splay on it and its holding up wellspring setup is very quick. 20 muns max including the leveling. touchscreen works fine so adhesion ridiculously good. honesty the best vie seen. actually it can sometimes be quite hard to get it off the bed sometimes...but id much prefer this for the tall and wide prints i coverall 7/10. yes worth the buy but not perfect by any means.
5284,"good meter with lots of features one of my favorite features of his mayilon digital clamp meter is the display. i like the color display on the black background. i am also fond of the included accessories. it includes a nice rigid zippered case, nice flexible leads, temperature probe and 3 aaa batteries.this meter seems to be accurate as i compared it to a fluke meter and the measurements were pretty much equal. i can't go into great detail on all the functions of this meter as i only really use it for dc measurements, ohms and continuity. this meter has a built in led ""headlight"" too that i feel is more of a novelty. the light shines out the front into the center of the clamp.for my needs i rate this meter at 5 stars. accuracy and build quality are very good.",[aaa],good meter with lots of features one of my favorite features of his mailman digital clamp meter is the display. i like the color display on the black background. i am also fond of the included accessories. it includes a nice rigid zippered case nice flexible leads temperature probe and 3 AAA battleships meter seems to be accurate as i compared it to a fluke meter and the measurements were pretty much equal. i can't go into great detail on all the functions of this meter as i only really use it for DC measurements ohms and continuity. this meter has a built in led headlight too that i feel is more of a novelty. the light shines out the front into the center of the clamper my needs i rate this meter at 5 stars. accuracy and build quality are very good.
1600,"does not stick to smooth plastic surfaces i like that you get sooo many pieces for a low price.i like that they are grippy pieces.that's where the likes end.these seemed to be good quality. i put them all over my phone case. i had clear, more expensive grip stickers on my phone case for about a year but those were finally starting to get gross and the edges were coming up. in an effort to save a little money i decided to try these. they sounded like they were good quality and the pictures show them on curved objects. plus you get sooo many of them i figured i'd be set for years.i cleaned my phone case with alcohol and applied a bunch of these all over the flat and curved surface. it seemed to be working well and gave my phone case a fun pattern.then after only a couple days, they started to peel up. now it's been a couple weeks and i am constantly replacing them and trying to restick the edges throughout the day. it's driving me nuts but i've already used so many that i can't return them and i feel like i need to use them up otherwise it was a waste of money. but i am very close to just tossing them and buying the more expensive clear ones that i already know work.so disappointing.","[sooo, sooo]",does not stick to smooth plastic surfaces i like that you get sou many pieces for a low price like that they are grippe hairpiece's where the likes parentheses seemed to be good quality. i put them all over my phone case. i had clear more expensive grip stickers on my phone case for about a year but those were finally starting to get gross and the edges were coming up. in an effort to save a little money i decided to try these. they sounded like they were good quality and the pictures show them on curved objects. plus you get sou many of them i figured I'd be set for yearning cleaned my phone case with alcohol and applied a bunch of these all over the flat and curved surface. it seemed to be working well and gave my phone case a fun patternless after only a couple days they started to peel up. now it's been a couple weeks and i am constantly replacing them and trying to restock the edges throughout the day. it's driving me nuts but I've already used so many that i can't return them and i feel like i need to use them up otherwise it was a waste of money. but i am very close to just tossing them and buying the more expensive clear ones that i already know works disappointing.
3382,"hard to tell... this was very easy to install. i am no expert, but as long as you know your way around a screwdriver, you'll be just fine.now let's fast forward 2 months. one of the actuators is still holding up strong. very reliable. but the 2nd one wo t stay open. its showing a constant ""closed"" fault red light on the switch board. not to mention the motor is super hot to the touch. since 50% of my order works perfectly. while the other doesn't. i dont know how to rate these.",[t],hard to tell... this was very easy to install. i am no expert but as long as you know your way around a screwdriver you'll be just fineness let's fast forward 2 months. one of the actuators is still holding up strong. very reliable. but the 2nd one quo t stay open. its showing a constant closed fault red light on the switch board. not to mention the motor is super hot to the touch. since 50% of my order works perfectly. while the other doesn't. i font know how to rate these.
3742,handy to have & easy to use! it' s handy to have and easy to use for a multitude of projects!,[s],handy to have e easy to use it s handy to have and easy to use for a multitude of projects
3134,it was not a blue color the color was not a blue color but it did print on my qidi shadow 5.5 s printer with a nice print.,[s],it was not a blue color the color was not a blue color but it did print on my midi shadow 5.5 s printer with a nice print.
12795,ok... it was so hard to asssamble.,[asssamble.],OK it was so hard to assemble


<b>Comment: </b> Although the heuristics based on regex patterns identified many spelling errors, the Hunspell corrections were often wrong (e.g. "sou" for "soo", "it' s" for "it s").

The spelling correction thus won't be applied. I will follow directly with modeling and first see and evaluate the performance of the models.

In [53]:
train.drop(['suspicious_words', 'corrected_text', 'likely_problematic'], axis=1, inplace=True)

### Preprocess

In [54]:
def preprocess(df, column_name, custom_stopwords, nlp):

  '''Extract tokens with lemmatization, filter out stopwords and punctuation'''

  if custom_stopwords is None:
    custom_stopwords = []

  preprocessed_text = []
  for doc in nlp.pipe(df[column_name], batch_size=500, disable=["ner", "parser"]):
    processed_tokens = [
        token.lemma_ for token in doc
        if token.is_alpha
        and not token.is_punct
        and token.lemma_ not in custom_stopwords
    ]

    preprocessed_text.append(' '.join(processed_tokens))

  return preprocessed_text

In [55]:
# Load spacy language model
nlp = spacy.load('en_core_web_lg')

# Combine custom stopwords with spaCy's default stopwords
words_to_exclude = ['product', 'quality', 'money', 'price', 'time']
custom_stopwords = nlp.Defaults.stop_words.union(set(words_to_exclude ))

# Explicitly ensure 'not' and 'no' is NOT a stopword
custom_stopwords.discard('not')
custom_stopwords.discard('no')

In [56]:
train['preprocessed_text'] = preprocess(train,'normalized_text', custom_stopwords, nlp)
train.sample(5, random_state = 42)

Unnamed: 0,rating,combined_text,normalized_text,preprocessed_text
9030,4,No bad for the price Not bright enough.,no bad for the price not bright enough.,no bad not bright
12968,1,Do not recommend Quality control on this product is not very good. First one I received had a cracked viewing window. Got a replacement and it was also cracked. Spend the money elsewhere,do not recommend quality control on this product is not very good. first one i received had a cracked viewing window. got a replacement and it was also cracked. spend the money elsewhere,not recommend control not good I receive crack view window replacement crack spend
3817,4,Lasting Comfortable,lasting comfortable,comfortable
11009,4,Heavy A little extra heavy for my use,heavy a little extra heavy for my use,heavy little extra heavy use
1493,1,Not great Does not get stains out any better than a basic stain remover. Using it for cleaning grease in the kitchen is intolerable because the smell gives me a horrible headache.,not great does not get stains out any better than a basic stain remover. using it for cleaning grease in the kitchen is intolerable because the smell gives me a horrible headache.,not great not stain basic stain remover use clean grease kitchen intolerable smell I horrible headache


<b>Comment: </b>Judging by the first example in the table starting with "no bad", it might be important to retain also this stopword apart from "not" that was determined during EDA.

In [57]:
# Count total number of tokens in 'combined_text'
total_tokens = train['preprocessed_text'].str.lower().str.split().apply(len).sum()

print(f'Total number of tokens after preprocessing: {total_tokens}')

Total number of tokens after preprocessing: 298089


### Check word co-occurance of the stopword "no" before exclusion

In [58]:
# Check most frequent co-occurance of stopword "no"

# Transform text by countvectorizer
vectorizer = CountVectorizer(ngram_range=(2, 2), lowercase=True)
X_bigrams = vectorizer.fit_transform(train['preprocessed_text'])

# Extract feature names (bigrams) and sum up frequencies of each bigram
bigrams = vectorizer.get_feature_names_out()
bigram_counts = X_bigrams.sum(axis=0).A1
bigram_freq = pd.DataFrame({'bigram': bigrams, 'count': bigram_counts})

# Filter bigrams where the first token is "no" and sort by frequency
filtered_bigrams = bigram_freq[bigram_freq['bigram'].str.startswith('no ')]
filtered_bigrams = filtered_bigrams.sort_values(by='count', ascending=False)

# Display results
print("Most frequent bigrams starting with 'no':")
print(filtered_bigrams)


Most frequent bigrams starting with 'no':
               bigram  count
95217          no way     96
94885        no issue     84
94918         no long     73
95009      no problem     67
94880  no instruction     58
...               ...    ...
95183        no trace      1
95184     no tracking      1
95185         no trap      1
95186      no trigger      1
95187       no tripod      1

[655 rows x 2 columns]


<b>Comment: </b>Eliminating "no" from the vocabulary could affect the interpretaion of the sentiment of some phrases, e.g. no issue, no problem. Both "not" and "no" will be excluded from stopwords.

### Streamline the preprocessing of the train and test sets

In [65]:
# Load the train and test sets
train = pd.read_csv('./train.csv', sep=';')
test = pd.read_csv('./test.csv', sep=';')

In [66]:
print(train.isnull().sum())
print(test.isnull().sum())

rating           0
combined_text    0
dtype: int64
rating           0
combined_text    0
dtype: int64


In [67]:
def normalize_and_preprocess(df, column_name, rating_column, custom_stopwords=None, nlp=None):
    """
    Combine normalization and preprocessing into one function.
    - Converts text to lowercase.
    - Removes HTML tags.
    - Normalizes text to ASCII (removes accents and special characters).
    - Lemmatizes text and filters stopwords and punctuation.

    Returns a new DataFrame with the preprocessed text and the original rating column.
    """
    if custom_stopwords is None:
        custom_stopwords = []

    # Normalize the text
    normalized_text = df[column_name].str.lower()
    normalized_text = normalized_text.apply(lambda x: BeautifulSoup(str(x), "html.parser").get_text())
    normalized_text = normalized_text.apply(lambda x:
        unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore'))

    # Preprocess the text
    preprocessed_text = []
    for doc in nlp.pipe(normalized_text, batch_size=500, disable=["ner", "parser"]):
        processed_tokens = [
            token.lemma_ for token in doc
            if token.is_alpha
            and not token.is_punct
            and token.lemma_ not in custom_stopwords
        ]
        preprocessed_text.append(' '.join(processed_tokens))

    # Create a new DataFrame with the preprocessed text and the original rating column
    result_df = df[[rating_column]].copy()  # Copy the original rating column
    result_df['text'] = preprocessed_text  # Add the preprocessed text

    return result_df


In [68]:
# Preprocessed train and test sets
train_processed = normalize_and_preprocess(
    train,
    column_name='combined_text',
    rating_column='rating',
    custom_stopwords=custom_stopwords,
    nlp=nlp)

test_processed = normalize_and_preprocess(
    test,
    column_name='combined_text',
    rating_column='rating',
    custom_stopwords=custom_stopwords,
    nlp=nlp)


  normalized_text = normalized_text.apply(lambda x: BeautifulSoup(str(x), "html.parser").get_text())
  normalized_text = normalized_text.apply(lambda x: BeautifulSoup(str(x), "html.parser").get_text())


In [69]:
# Save output to csv
train_processed.to_csv('./train_preproc.csv', sep=';', index=False)
test_processed.to_csv('./test_preproc.csv', sep=';',index=False)