<a href="https://colab.research.google.com/github/fadhan654/MachineLearning/blob/main/ML13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def to_lowercase(text):
    return text.lower()

# Contoh penggunaan
sample_text = "Fadhlan going to Market."
print(to_lowercase(sample_text))

fadhlan going to market.


In [2]:
import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Contoh penggunaan
sample_text = "Don't go away, Stay tune!"
print(remove_punctuation(sample_text))

Dont go away Stay tune


In [3]:
import re

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# Contoh penggunaan
sample_text = "I have 4 children, 2 boys and 2 girls "
print(remove_numbers(sample_text))

I have  children,  boys and  girls 


In [4]:
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')

def tokenize(text):
    return word_tokenize(text)

# Contoh penggunaan
sample_text = "Welcome to the jungle, yeayyy."
print(tokenize(sample_text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Welcome', 'to', 'the', 'jungle', ',', 'yeayyy', '.']


In [5]:
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(words):
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]

# Contoh penggunaan
sample_text = "I am a student at Nusa Putra University"
tokenized_text = tokenize(sample_text)
print(remove_stopwords(tokenized_text))

['I', 'student', 'Nusa', 'Putra', 'University']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
from nltk.stem import PorterStemmer

def stem_words(words):
    ps = PorterStemmer()
    return [ps.stem(word) for word in words]

# Contoh penggunaan
sample_text = "They are my friends"
tokenized_text = tokenize(sample_text)
filtered_words = remove_stopwords(tokenized_text)
print(stem_words(filtered_words))

['they', 'friend']


In [9]:
sentence1 = "I love badminton"
sentence2 = "Tontowi Ahmad great badminton player"
sentence3 = "Tontowi Ahmad has won one"

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
docs = [sentence1, sentence2, sentence3]
print(docs)

['I love badminton', 'Tontowi Ahmad great badminton player', 'Tontowi Ahmad has won one']


In [11]:
#Mendefinisikan dan menyesuaikan count vectorizer pada dokumen.

vec = CountVectorizer()
X = vec.fit_transform(docs)
#Mengonversi vektor pada DataFrame menggunakan pandas

df = pd.DataFrame(X.toarray(),
    columns=vec.get_feature_names_out())
df.head()

Unnamed: 0,ahmad,badminton,great,has,love,one,player,tontowi,won
0,0,1,0,0,1,0,0,0,0
1,1,1,1,0,0,0,1,1,0
2,1,0,0,1,0,1,0,1,1


In [12]:
import numpy as np
from collections import Counter
from math import log

# Tiga dokumen dalam korpus
documents = [
    "Ginting beats Kento Momota in badminton match",
    "Kento Momota's badminton rank is below Ginting"
    ]

# Preprocessing: Lowercasing and tokenizing
tokenized_documents = [doc.lower().split() for doc in documents]

# Menghitung TF
def compute_tf(tokenized_doc):
    tf_dict = {}
    term_count = Counter(tokenized_doc)
    total_terms = len(tokenized_doc)
    for term, count in term_count.items():
        tf_dict[term] = count / total_terms
    return tf_dict

tf_list = [compute_tf(doc) for doc in tokenized_documents]

print("Term Frequency (TF):")
for idx, tf in enumerate(tf_list):
    print(f"Document {idx + 1} TF:")
    for term, score in tf.items():
        print(f"    {term}: {score:.4f}")

Term Frequency (TF):
Document 1 TF:
    ginting: 0.1429
    beats: 0.1429
    kento: 0.1429
    momota: 0.1429
    in: 0.1429
    badminton: 0.1429
    match: 0.1429
Document 2 TF:
    kento: 0.1429
    momota's: 0.1429
    badminton: 0.1429
    rank: 0.1429
    is: 0.1429
    below: 0.1429
    ginting: 0.1429


In [13]:
# Menghitung IDF
def compute_idf(tokenized_docs):
    idf_dict = {}
    total_docs = len(tokenized_docs)
    all_terms = set(term for doc in tokenized_docs for term in doc)
    for term in all_terms:
        doc_containing_term = sum(1 for doc in tokenized_docs if term in doc)
        idf_dict[term] = log(total_docs / (1 + doc_containing_term)) + 1
    return idf_dict

idf_dict = compute_idf(tokenized_documents)

print("\nInverse Document Frequency (IDF):")
for term, score in idf_dict.items():
    print(f"    {term}: {score:.4f}")


Inverse Document Frequency (IDF):
    ginting: 0.5945
    rank: 1.0000
    is: 1.0000
    match: 1.0000
    below: 1.0000
    badminton: 0.5945
    momota's: 1.0000
    in: 1.0000
    momota: 1.0000
    kento: 0.5945
    beats: 1.0000


In [16]:
# Menghitung TF-IDF
def compute_tfidf(tf_list, idf_dict):
    tfidf_list = []
    for tf in tf_list:
        tfidf_dict = {}
        for term, tf_value in tf.items():
            tfidf_dict[term] = tf_value * idf_dict.get(term, 0)
        tfidf_list.append(tfidf_dict)
    return tfidf_list

tfidf_list = compute_tfidf(tf_list, idf_dict)

print("\nTF-IDF:")
for idx, tfidf in enumerate(tfidf_list):
    print(f"Document {idx + 1} TF-IDF:")
    for term, score in tfidf.items():
        print(f"    {term}: {score:.4f}")


TF-IDF:
Document 1 TF-IDF:
    ginting: 0.0849
    beats: 0.1429
    kento: 0.0849
    momota: 0.1429
    in: 0.1429
    badminton: 0.0849
    match: 0.1429
Document 2 TF-IDF:
    kento: 0.0849
    momota's: 0.1429
    badminton: 0.0849
    rank: 0.1429
    is: 0.1429
    below: 0.1429
    ginting: 0.0849


In [15]:
from gensim.models import Word2Vec
import numpy as np

corpus = [
    "Ginting is the first rank.",
    "second rank is Axelsen.",
    "And the third rank is Kento Momota."
]

sentences = [doc.split() for doc in corpus]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

def document_vector(doc):
    return np.mean([model.wv[word] for word in doc.split() if word in model.wv], axis=0)

doc_vectors = [document_vector(doc) for doc in corpus]
print(doc_vectors)

[array([ 2.2522365e-03,  6.4851937e-04, -5.1531941e-05,  2.3106909e-03,
       -2.0476163e-03, -3.9441790e-03, -1.3600638e-03,  5.6396397e-03,
       -4.9265120e-03, -2.7241032e-03,  3.5401578e-03, -1.7015550e-03,
        1.9902207e-03,  1.4584693e-04,  1.9163847e-03, -3.3103675e-05,
        2.2207857e-03, -1.6701591e-03, -2.3737033e-03, -7.6351985e-03,
       -1.2437934e-03, -1.2981085e-03,  1.3057077e-03, -4.3931203e-03,
        1.1771233e-03, -3.2708594e-03, -1.4812399e-03,  4.6744170e-03,
       -1.1120265e-03, -9.3837174e-05, -6.5763292e-05, -2.1459344e-03,
        3.2268316e-03,  1.5489327e-03, -2.5710168e-03,  1.4511810e-03,
        2.5768352e-03, -7.0543494e-05, -2.2386808e-03, -1.4686211e-03,
       -4.1319234e-03,  2.1915704e-03, -3.4602578e-03,  2.5354628e-03,
       -1.0085892e-03, -5.5950909e-04, -5.5201852e-04,  2.3073819e-03,
       -1.0541726e-03,  5.0537498e-03, -1.3581451e-03,  6.6939119e-04,
       -2.6750148e-03,  1.5135758e-03, -1.3748379e-04, -3.7581965e-03,
     

In [17]:
!pip install numpy pandas scikit-learn



In [18]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

# 1. Mengumpulkan data
newsgroups = fetch_20newsgroups(subset='all')

# 2. Preprocessing data
# Tidak perlu preprocessing khusus karena kita akan menggunakan TfidfVectorizer

# 3. Membagi data menjadi training dan testing set
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.25, random_state=42)

# 4. Melatih model
# Membuat pipeline yang mencakup TfidfVectorizer dan MultinomialNB
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Melatih model menggunakan training set
model.fit(X_train, y_train)

# 5. Mengevaluasi model
# Prediksi pada testing set
y_pred = model.predict(X_test)

# Evaluasi kinerja model
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(metrics.classification_report(y_test, y_pred, target_names=newsgroups.target_names))

# Confusion Matrix
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

Accuracy: 0.8425297113752123
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.88      0.72      0.79       198
           comp.graphics       0.86      0.79      0.82       245
 comp.os.ms-windows.misc       0.88      0.83      0.85       242
comp.sys.ibm.pc.hardware       0.66      0.86      0.75       238
   comp.sys.mac.hardware       0.95      0.84      0.89       250
          comp.windows.x       0.96      0.80      0.87       260
            misc.forsale       0.96      0.66      0.78       241
               rec.autos       0.89      0.93      0.91       244
         rec.motorcycles       0.91      0.95      0.93       219
      rec.sport.baseball       0.96      0.94      0.95       261
        rec.sport.hockey       0.90      0.98      0.94       245
               sci.crypt       0.78      0.98      0.87       251
         sci.electronics       0.92      0.80      0.86       249
                 sci.me