In [None]:
import pandas as pd
import re

# Input locations: the movie lookup table and the scraped comment dump
file_path_csv = 'vdoLinks.csv'
file_path_text = 'NoisyText.txt'


def _read_whole_file(path, encoding):
    """Return the full contents of `path` decoded with `encoding`."""
    with open(path, 'r', encoding=encoding) as handle:
        return handle.read()


# Prefer UTF-8; if the bytes are not valid UTF-8, re-read the file as latin-1
try:
    raw_data = _read_whole_file(file_path_text, 'utf-8')
except UnicodeDecodeError:
    raw_data = _read_whole_file(file_path_text, 'latin-1')

In [None]:
# Load the movie lookup table from the CSV file into a DataFrame
movie_list = pd.read_csv(file_path_csv)

# Preview the first five rows (head() defaults to n=5)
print(movie_list.head())

     youtubeId  movieId                               title
0  K26_sDKnvMU        1                    Toy Story (1995)
1  3LPANjHlPxo        2                      Jumanji (1995)
2  rEnOoWs3FuA        3             Grumpier Old Men (1995)
3  j9xml1CxgXI        4            Waiting to Exhale (1995)
4  ltwvKLnj1B4        5  Father of the Bride Part II (1995)


In [None]:
# List of slang words found
slang_words = [
    'fuck', 'fucking', 'troll', 'boob', 'hot', 'drugs', 'killing', 'fucking hot', 'ass', 'shit', 'shitty', 'damn', 
    'bullshit', 'idiot', 'crap', 'crappy', 'holy crap', 'prick', 'dumbass', 'sucked', 'dick', 'dicks', 'pwned', 'hell', 
    'stupid', 'fucked', 'badass', 'whack', 'faggot', 'pissed', 'douchbag', 'shithead', 'fucks', 'bitch', 'slut', 'whore', 
    'fucker', 'scum', 'scumbag', 'twat', 'retarded', 'retard', 'nasty', 'pussy', 'goddamn', 'dumb', 'guts', 'screw', 'jackass', 
    'asshole', 'insane', 'horny'
]

# Build a single whole-word alternation over the slang list.
# - Longest entries come first: regex alternation is leftmost-first, so a
#   phrase such as 'fucking hot' would otherwise never match because its
#   shorter prefix 'fucking' wins.
# - Every entry is escaped so it is always matched literally.
slang_alternatives = sorted(slang_words, key=len, reverse=True)

# to find all occurrences of the slang words
# (an empty word list would produce a pattern that matches the empty string
# at every word boundary, so that case is short-circuited)
if slang_alternatives:
    slang_regex = re.compile(
        r'\b(?:' + '|'.join(re.escape(word) for word in slang_alternatives) + r')\b',
        re.IGNORECASE,
    )
    found_words = slang_regex.findall(raw_data)
else:
    found_words = []

# Dictionary to store the counts of each found slang word.
# Matches are case-insensitive, so the tally is keyed on the lower-cased form
# and accumulated in one pass; this sums 'Shit', 'SHIT' and 'shit' together
# instead of letting one casing overwrite the others.
slang_counts = {}
for word in found_words:
    key = word.lower()
    slang_counts[key] = slang_counts.get(key, 0) + 1

print("Slang word counts:")
print(slang_counts)

Slang word counts:
{'fuck': 210, 'dumb': 49, 'asshole': 16, 'shit': 441, 'crappy': 28, 'prick': 1, 'crap': 3, 'stupid': 173, 'hell': 16, 'fucking': 284, 'jackass': 1, 'shithead': 1, 'whack': 3, 'bullshit': 5, 'badass': 1, 'killing': 47, 'nasty': 16, 'sucked': 52, 'fucker': 2, 'douchbag': 1, 'guts': 10, 'scumbag': 6, 'pussy': 7, 'horny': 7, 'troll': 9, 'fucked': 3, 'damn': 67, 'bitch': 6, 'idiot': 2, 'holy crap': 6, 'drugs': 1, 'insane': 1, 'whore': 1, 'shitty': 5, 'goddamn': 3, 'twat': 1, 'dumbass': 11, 'hot': 124, 'slut': 3, 'screw': 3, 'scum': 6, 'pissed': 18, 'dick': 21, 'ass': 7, 'retarded': 23, 'dicks': 5, 'boob': 2, 'faggot': 3, 'pwned': 1, 'fucks': 15}


In [None]:
# Lookup table: youtubeId -> movie title
id_to_title = dict(zip(movie_list['youtubeId'], movie_list['title']))

# Substrings that mark a section as noise; a section containing any of them
# is reported as having no comments
noise_patterns = ['HttpError', 'charmap', 'pt1m', 'video has disabled comments', '6827 9 1 0 0']

# Line-level filters applied to the sections that are kept
_bare_id_line = re.compile(r'^[a-zA-Z0-9_-]{11}$')
_http_error_line = re.compile(r"^\s*<HttpError.*")


def _is_comment_line(raw_line):
    """Return True when `raw_line` should be kept as comment text.

    Rejected: blank lines, lines consisting only of an 11-character id,
    lines starting with '<HttpError', and 'charmap' codec error lines.
    """
    text = raw_line.strip()
    if not text:
        return False
    if _bare_id_line.match(text):
        return False
    if _http_error_line.match(text):
        return False
    if text.startswith("'charmap' codec can't encode"):
        return False
    return True


# Each movie's block in the raw text is introduced by a 'NewMovieDrPQRd' line
movie_sections = re.split(r'NewMovieDrPQRd\s*\n', raw_data.strip())

processed_data = []

for section in movie_sections:
    trimmed = section.strip()
    if not trimmed:
        # Nothing but whitespace between two separators
        continue

    # First line of a section is the youtube id; the rest is comment text
    lines = trimmed.split('\n')
    youtube_id = lines[0].strip()
    title = id_to_title.get(youtube_id, 'Title not found')

    if any(marker in section for marker in noise_patterns):
        comments = "No comments were found"
    else:
        # Keep the original (unstripped) lines and glue them with single spaces
        kept_lines = [raw_line for raw_line in lines[1:] if _is_comment_line(raw_line)]
        comments = ' '.join(kept_lines).strip()

    record = {
        'youtubeId': youtube_id,
        'title': title,
        'comments': comments
    }
    processed_data.append(record)

# Show the first three processed movies as a sanity check
print('Sample Movies and Comments')
print()
for entry in processed_data[:3]:
    print(f"Movie: {entry['title']}")
    print(f"The comments are:\n\n{entry['comments']}\n")

Sample Movies and Comments

Movie: Toy Story (1995)
The comments are:

No comments were found

Movie: Jumanji (1995)
The comments are:

No comments were found

Movie: Grumpier Old Men (1995)
The comments are:

I was looking for halloween themed movies and stumbled over this... is there anything halloween related in this film? Seen it years ago but can't remember squat welp, been 3 years since anybody have commented Well, there was supposed to be another sequel. From what I heard, the two guys go to Italy (to meet their new relatives, I suppose) & wind up meeting Italian versions of themselves. i loved the first one is it like the first one or not?? 128238 170 7 0 7 The more things change, the more they stay the same in Wabasha, Minnesota. The uncatchable fish named Catfish Hunter grows fatter. The wisecracks, zingers and put downs pile up like freshly raked leaves. And GRUMPY OLD MEN become grumpier in the sequel that's "pure delight, a wonderfully warmhearted comedy" (David Sheehan, C

In [None]:
# ANSI escape codes for red text and for resetting the colour.
# They are applied only when printing, never stored in the cleaned data:
# each escape sequence ends in the letter 'm', so embedding them in the
# cleaned text would add alphabetic characters to any later letter count.
RED = '\033[91m'
RESET = '\033[0m'

# Placeholder written in place of every slang match
MASK = '**'

# One compiled whole-word alternation instead of one regex pass per word.
# Longest entries first, so multi-word phrases ('holy crap', 'fucking hot')
# are masked as a unit rather than having their shorter parts replaced first.
# An empty word list yields no pattern (it would otherwise match the empty
# string at every word boundary).
slang_pattern = None
if slang_words:
    slang_pattern = re.compile(
        r'\b(?:'
        + '|'.join(re.escape(word) for word in sorted(slang_words, key=len, reverse=True))
        + r')\b',
        flags=re.IGNORECASE,
    )


def mask_slang(text, replacement=MASK):
    """Replace every slang match in `text` with `replacement`.

    Returns a `(new_text, number_of_replacements)` tuple. The replacement is
    inserted literally (no backslash/group expansion).
    """
    if slang_pattern is None:
        return text, 0
    return slang_pattern.subn(lambda _match: replacement, text)


# clean slang words
cleaned_data_list = []
slang_positions = []  # indices of movies in which at least one word was masked
for position, item in enumerate(processed_data):
    masked_comments, hits = mask_slang(item['comments'])
    cleaned_data_list.append({
        'youtubeId': item['youtubeId'],
        'title': item['title'],
        'comments': masked_comments
    })
    if hits:
        slang_positions.append(position)

# movies whose comments had at least one slang word masked
# (decided by the replacement count, so a comment that merely contained a
# literal '**' beforehand is not selected)
commented_movies = [cleaned_data_list[position] for position in slang_positions]

# sample movies for output: same records, with the mask coloured red for display
sample_movies = [
    {
        **cleaned_data_list[position],
        'comments': mask_slang(processed_data[position]['comments'], f"{RED}{MASK}{RESET}")[0],
    }
    for position in slang_positions[:3]
]

# Print sample movies with slang words
print("Sample Movies with slang words cleaned:")
for movie in sample_movies:
    print ()
    print(f"Movie: {movie['title']}")
    print(f"The comments are:\n\n{movie['comments']}\n")

# Plain-text dump of every movie (title line, comment line, blank line),
# assembled with a single join rather than repeated string concatenation
final_output = "".join(
    f"{movie['title']}\n{movie['comments']}\n\n" for movie in cleaned_data_list
)

Sample Movies with slang words cleaned:

Movie: Tom and Huck (1995)
The comments are:

So this is what my English teacher is going to make us watch What was the name of the music that was playing in the background in this Tom and Huck trailer video, hillaryannmarina? I was forced to watch this against my will The worst adaptation of Mark Twain's novel 2 ever come 2 the big screen, Elijah Wood's performance as Huckleberry Finn was more believable than Brad Renfro's [91m**[0m Ms Cervi I remember seeing this film at My Summer Camp. And Yes, I did read The Adventures of Tom Sawyer at My Middle School as Part of The Start to Finish Book Series. I remember seeing this preview on my old Disney VHS...I think it was Pocahontas. Never saw the film though. this movie [91m**[0m a [91m**[0m [91m**[0m and i cant believe my [91m**[0m child self went to see it in theaters his (brad's) spirit lives on.  just call to him.  eeh, this is just a witch's kind of thing, though. There black man frie

In [None]:
# Tally the alphabetic characters in the raw contents read from NoisyText.txt
file_path_text = 'NoisyText.txt'
letters_before = sum(map(str.isalpha, raw_data))

print(f"LETTERSbefore: {letters_before}")

LETTERSbefore: 11554895


In [None]:
# Tally the alphabetic characters that remain in the cleaned output text
letters_after = sum(symbol.isalpha() for symbol in final_output)

print(f"LETTERSafter: {letters_after}")

LETTERSafter: 2600234


In [None]:
# Fraction of the raw file's letters still present after cleaning.
# NOTE: raises ZeroDivisionError if letters_before is 0 (raw text without letters).
ratio = letters_after / letters_before

print(f"Ratio (LETTERSafter/LETTERSbefore): {ratio:.4f}")

# A value below one means the cleaned text holds fewer letters than the raw text
text_shrank = ratio < 1
if text_shrank:
    print("This ratio is less than 1, which indicates that one or more noisy text entries were removed.")

Ratio (LETTERSafter/LETTERSbefore): 0.2250
This ratio is less than 1, which indicates that one or more noisy text entries were removed.
