### UAS PMDM

In [1]:
import os
import re
import sys
import string
import modSpellChecker_1 as sc
from contractions_1 import CONTRACTION_MAP
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import numpy as np
import pandas as pd
import operator

In [2]:
# Symbols whose consecutive repetitions are collapsed by repeatcharNormalize.
# spellNormalize also consults this list to decide which tokens to leave untouched.
# Note: 'a', 'b' and 'c' are not listed, and '-' appears twice.
character = [
    'z', 'y', 'x', 'w', 'v', 'u', 't', 's', 'r', 'q', 'p', 'o', 'n', 'm', 'l',
    'k', 'j', 'i', 'h', 'g', 'f', 'e', 'd',
    ',', ';', ':', '-', '...', '?', '!',
    '(', ')', '[', ']', '{', '}', '<', '>',
    '"', '/', '\'', '#', '-', '@',
]

def repeatcharNormalize(text):
    """Collapse repeated occurrences of every symbol listed in `character`.

    For each symbol, runs of 5, 4, 3 and then 2 consecutive copies are
    replaced (in that order) by a single copy. Doubled letters are therefore
    reduced as well, e.g. "inggris" becomes "ingris".

    Args:
        text: the string to normalize.

    Returns:
        The string with the repetitions collapsed.
    """
    for symbol in character:
        # Longest runs first, so a long streak shrinks step by step.
        for run_length in (5, 4, 3, 2):
            text = text.replace(symbol * run_length, symbol)
    return text

def spellNormalize(text):
    """Spell-correct a sequence of tokens.

    `text` is iterated item by item (normalize_corpus passes a token list).
    Items found in `character` are copied unchanged; every other item is
    replaced by the result of `sc.correction(item)`.

    Returns:
        A new list with one entry per input item, in the same order.
    """
    return [
        token if token in character else sc.correction(token)
        for token in text
    ]

def tokenize_text(text):
    """Split `text` with `nltk.word_tokenize` and strip surrounding whitespace
    from each resulting token.

    Returns:
        A list of token strings.
    """
    return [piece.strip() for piece in nltk.word_tokenize(text)]

def expand_contractions(text, contraction_mapping):
    """Replace contractions in `text` by their expansions, then drop apostrophes.

    Matching is case-insensitive. The first character of the matched text is
    kept and followed by the expansion minus its own first character, so the
    capitalisation of the original word is preserved.

    Args:
        text: the string to process.
        contraction_mapping: dict mapping a contraction to its expanded form.
            May be empty, in which case only the apostrophes are removed.

    Returns:
        The expanded string with every "'" removed.
    """
    # Empty keys would make the pattern match the empty string everywhere.
    keys = [key for key in contraction_mapping if key]
    if keys:
        # Case-insensitive lookup table, so keys containing capitals are still
        # found when the text uses a different casing.
        lowered_mapping = {key.lower(): value
                           for key, value in contraction_mapping.items()}
        # Longest alternatives first: regex alternation takes the first branch
        # that matches, so a short key must not shadow a longer one.
        keys.sort(key=len, reverse=True)
        contractions_pattern = re.compile(
            # Keys are literal text, not regular expressions.
            '({})'.format('|'.join(re.escape(key) for key in keys)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = contraction_mapping.get(match)
            if not expanded_contraction:
                expanded_contraction = lowered_mapping.get(match.lower())
            if expanded_contraction is None:
                # No usable entry: leave the text as it was instead of crashing.
                return match
            return match[0] + expanded_contraction[1:]

        text = contractions_pattern.sub(expand_match, text)
    return text.replace("'", "")

def stemmer_text(text):
    """Stem `text` with the Sastrawi stemmer.

    The stemmer is created on the first call and stored on the function
    object, so later calls reuse it instead of constructing a new factory and
    stemmer each time (normalize_corpus calls this once per document).

    Args:
        text: the string to stem.

    Returns:
        The value returned by the stemmer's `stem` method for `text`.
    """
    stemmer = getattr(stemmer_text, '_stemmer', None)
    if stemmer is None:
        stemmer = StemmerFactory().create_stemmer()
        stemmer_text._stemmer = stemmer
    return stemmer.stem(text)

In [3]:
def remove_special_characters(text):
    """Delete every `string.punctuation` character from each token of `text`.

    The text is tokenized with `tokenize_text`; tokens that become empty once
    the punctuation is removed are dropped, and the rest are joined with
    single spaces.

    Returns:
        The cleaned string.
    """
    punctuation_pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    kept_tokens = []
    for token in tokenize_text(text):
        stripped = punctuation_pattern.sub('', token)
        if stripped:
            kept_tokens.append(stripped)
    return ' '.join(kept_tokens)

In [4]:
# Fetch the stop-word list from the Sastrawi factory once, at module level;
# remove_stopwords reads the `stopword_list` global.
factory = StopWordRemoverFactory()
stopword_list = factory.get_stop_words()

In [5]:
def remove_stopwords(text):
    """Drop every token of `text` that appears in the global `stopword_list`.

    The comparison is an exact, case-sensitive match on tokens produced by
    `tokenize_text`.

    Returns:
        The remaining tokens joined with single spaces.
    """
    kept_tokens = []
    for token in tokenize_text(text):
        if token in stopword_list:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [6]:
def normalize_corpus(corpus, tokenize=False):
    """Run the full normalization pipeline over every document in `corpus`.

    Steps, in order: contraction expansion, stemming, punctuation removal,
    repeated-character collapsing and stop-word removal.

    Args:
        corpus: iterable of strings.
        tokenize: when True, each normalized document is additionally
            tokenized and spell-corrected, and is returned as a token list
            instead of a string.

    Returns:
        A list with exactly one entry per input document: a string when
        `tokenize` is False, a list of tokens when it is True.
    """
    normalized_corpus = []
    for text in corpus:
        text = expand_contractions(text, CONTRACTION_MAP)
        text = stemmer_text(text)
        text = remove_special_characters(text)
        text = repeatcharNormalize(text)
        text = remove_stopwords(text)
        if tokenize:
            text = spellNormalize(tokenize_text(text))
        # Append once per document; previously tokenize=True stored both the
        # string and the token list, doubling the length of the result.
        normalized_corpus.append(text)
    return normalized_corpus

In [7]:
# Load cnn_sport.csv (path is relative to the working directory) and display it.
dataset = pd.read_csv('cnn_sport.csv')
dataset

Unnamed: 0,title,category,link
0,Klasemen Liga Inggris: Tottenham Gusur Man Cit...,Sepakbola,https://www.cnnindonesia.com/olahraga/20231024...
1,Khamzat Disebut Tak Pantas Dapat Duel Gelar Ju...,Olahraga Lainnya,https://www.cnnindonesia.com/olahraga/20231023...
2,"Hasil Liga Inggris: Son Cetak Gol, Tottenham H...",Sepakbola,https://www.cnnindonesia.com/olahraga/20231024...
3,Binder Merasa Kasihan Martin Gagal Menang di M...,Moto GP,https://www.cnnindonesia.com/olahraga/20231024...
4,FOTO: Tottenham Jaga Rekor Belum Terkalahkan U...,Sepakbola,https://www.cnnindonesia.com/olahraga/20231024...
...,...,...,...
1995,Jadwal Siaran Langsung Final Hong Kong Open: 3...,Raket,https://www.cnnindonesia.com/olahraga/20230917...
1996,Klasemen Liga Inggris Usai MU Terpuruk di Old ...,Sepakbola,https://www.cnnindonesia.com/olahraga/20230917...
1997,Ronaldo Tunjukkan Gairah Besar di Saudi Pro Le...,Sepakbola,https://www.cnnindonesia.com/olahraga/20230917...
1998,"2 Pemain Al Raed Berebut Tendang Penalti, Rona...",Sepakbola,https://www.cnnindonesia.com/olahraga/20230917...


In [8]:
# Select the first column by position; the preview of this Series is named `title`.
# NOTE(review): selecting by label would be less fragile if the column order changes.
feature = dataset.iloc[:,0]

In [9]:
# Peek at the first ten raw entries before normalization.
feature[0:10]

0    Klasemen Liga Inggris: Tottenham Gusur Man Cit...
1    Khamzat Disebut Tak Pantas Dapat Duel Gelar Ju...
2    Hasil Liga Inggris: Son Cetak Gol, Tottenham H...
3    Binder Merasa Kasihan Martin Gagal Menang di M...
4    FOTO: Tottenham Jaga Rekor Belum Terkalahkan U...
5    INFOGRAFIS: Jadwal Timnas Indonesia di Kualifi...
6          Messi Banggakan Capaian Bersama Inter Miami
7    Erick Thohir Soal Persiapan Piala Dunia U-17: ...
8    Bagnaia Mulai Pongah Jelang MotoGP Thailand da...
9    Sidang Komdis PSSI: Hugo Samir Pukul Pemain Pe...
Name: title, dtype: object

In [10]:
# Normalize every entry with the default tokenize=False, so each result is a string;
# the length is displayed as a quick check of how many entries came back.
norm_corpus = normalize_corpus(feature)
len(norm_corpus)

2000

In [11]:
# Display the normalized corpus.
# NOTE(review): this prints the whole list; consider showing only a slice such as norm_corpus[:10].
norm_corpus

['klasemen liga ingris totenham gusur man city puncak',
 'khamzat sebut tak pantas duel gelar juara ufc',
 'hasil liga ingris son cetak gol totenham hajar fulham 20',
 'binder rasa kasihan martin gagal menang motogp australia',
 'foto totenham jaga rekor kalah usai hajar fulham',
 'infografis jadwal timnas indonesia kualifikasi piala dunia 2026',
 'mesi banga capai sama inter miami',
 'erick thohir soal siap piala dunia u17 fifa beri jempol',
 'bagnaia mulai pongah jelang motogp thailand malaysia',
 'sidang komdis psi hugo samir pukul main persib hukum 2 laga',
 'daftar main indonesia french open 2023',
 'bolden jadi main nba usai terima twoway contract bucks',
 'reaksi ibu welber jardim usai sang anak fasih bahasa indonesia',
 'psi ajak warga bandung tonton piala dunia u17 bukti gila bola',
 'bantai filipina timnas futsal putri indonesia juara turnamen invitasi',
 'link live streaming persija vs rans liga 1',
 'top 3 sports zarco juara motogp australia makhachev menang ko',
 'chow yun