<a href="https://colab.research.google.com/github/fundaylncii/NaturalLanguageProcessing/blob/main/NLPTextPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Doğal Dil İşleme (NLP), bilgisayarların insan dilini insana en yakın şekilde anlayıp kullanabilmesini amaçlayan çalışmaların bütünüdür.

Bu kapsamda metin analizleri gerçekleştirilir.

Örnek uygulamalar: chatbotlar, duygu analizi, dil çeviri modelleri vb.

Adımlar:

1. Text Preprocessing
2. Text Visualization
3. Sentiment Analysis
4. Feature Engineering
5. Sentiment Modeling



In [None]:
 pip install nltk textblob wordcloud



In [1]:
from warnings import filterwarnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob
from wordcloud import WordCloud


In [2]:
# Notebook-wide setup: silence warnings and tune how pandas renders frames.
filterwarnings("ignore")


def _two_decimals(value):
    """Format a number with exactly two digits after the decimal point."""
    return "%.2f" % value


# (option, value) pairs: show every column, allow 200-character-wide output,
# and print floats with two decimals.
for _option, _setting in (
    ("display.max_columns", None),
    ("display.width", 200),
    ("display.float_format", _two_decimals),
):
    pd.set_option(_option, _setting)

In [3]:
## TEXT PREPROCESSING

# Read the Amazon review export into the working frame `df`.
# NOTE(review): this is an absolute /content path, presumably the Colab
# upload location — confirm before running elsewhere.
reviews_csv = "/content/amazon_reviews.csv"
df = pd.read_csv(filepath_or_buffer=reviews_csv)

In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,day_diff,helpful_yes,total_vote
0,A3SBTW3WS4IQSN,B007WTAJTO,,"[0, 0]",No issues.,4.0,Four Stars,1406073600,2014-07-23,138,0,0
1,A18K1ODH1I2MVB,B007WTAJTO,0mie,"[0, 0]","Purchased this for my device, it worked as adv...",5.0,MOAR SPACE!!!,1382659200,2013-10-25,409,0,0
2,A2FII3I2MBMUIA,B007WTAJTO,1K3,"[0, 0]",it works as expected. I should have sprung for...,4.0,nothing to really say....,1356220800,2012-12-23,715,0,0
3,A3H99DFEG68SR,B007WTAJTO,1m2,"[0, 0]",This think has worked out great.Had a diff. br...,5.0,Great buy at this price!!! *** UPDATE,1384992000,2013-11-21,382,0,0
4,A375ZM4U047O79,B007WTAJTO,2&amp;1/2Men,"[0, 0]","Bought it with Retail Packaging, arrived legit...",5.0,best deal around,1373673600,2013-07-13,513,0,0


In [5]:
## NORMALIZING CASE FOLDING

## Normalise upper/lower case: every review is converted to lower case.

lowercased_reviews = df["reviewText"].str.lower()
df["reviewText"] = lowercased_reviews
df["reviewText"]

Unnamed: 0,reviewText
0,no issues.
1,"purchased this for my device, it worked as adv..."
2,it works as expected. i should have sprung for...
3,this think has worked out great.had a diff. br...
4,"bought it with retail packaging, arrived legit..."
...,...
4910,i bought this sandisk 16gb class 10 to use wit...
4911,used this for extending the capabilities of my...
4912,great card that is very fast and reliable. it ...
4913,good amount of space for the stuff i want to d...


In [6]:
## PUNCTUATIONS
## Expressions that carry no measurable signal (numbers, punctuation marks, etc.) should be removed from the texts.
## Done with a regular expression: delete every character that is neither a word character nor whitespace.

# Raw string so that \w and \s reach the regex engine verbatim. In a plain
# literal they are invalid escape sequences, which Python warns about.
# NOTE(review): deleting punctuation outright glues neighbouring words together
# ("great.had" -> "greathad" in the displayed output) — confirm this is intended.
df["reviewText"] = df["reviewText"].str.replace(r"[^\w\s]", "", regex=True)
df["reviewText"]

Unnamed: 0,reviewText
0,no issues
1,purchased this for my device it worked as adve...
2,it works as expected i should have sprung for ...
3,this think has worked out greathad a diff bran...
4,bought it with retail packaging arrived legit ...
...,...
4910,i bought this sandisk 16gb class 10 to use wit...
4911,used this for extending the capabilities of my...
4912,great card that is very fast and reliable it c...
4913,good amount of space for the stuff i want to d...


In [7]:
## NUMBERS
## Drop digits that carry no measurable value (this is case dependent; in some problems numbers can matter).

# Raw string so that \d reaches the regex engine verbatim. In a plain literal
# it is an invalid escape sequence, which Python warns about.
df["reviewText"] = df["reviewText"].str.replace(r"\d", "", regex=True)
df["reviewText"]

Unnamed: 0,reviewText
0,no issues
1,purchased this for my device it worked as adve...
2,it works as expected i should have sprung for ...
3,this think has worked out greathad a diff bran...
4,bought it with retail packaging arrived legit ...
...,...
4910,i bought this sandisk gb class to use with my...
4911,used this for extending the capabilities of my...
4912,great card that is very fast and reliable it c...
4913,good amount of space for the stuff i want to d...


In [8]:
## STOP WORDS
## Getting rid of words that are very common in the language (is, of, the, etc.)

# Import nltk and download its "stopwords" resource.
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# English stop-word list from the nltk stopwords corpus, printed for inspection.
sw = stopwords.words("english")
print(sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
# Drop every English stop word from each review.
# A set gives constant-time membership tests; the list in `sw` would otherwise
# be scanned once per token.
stop_words = set(sw)

# str() keeps the split from failing on non-string cells.
# NOTE(review): a missing value would become the literal token "nan" —
# confirm whether missing reviews should be dropped or blanked upstream.
df["reviewText"] = df["reviewText"].apply(
    lambda text: " ".join(word for word in str(text).split() if word not in stop_words)
)
df["reviewText"]

Unnamed: 0,reviewText
0,issues
1,purchased device worked advertised never much ...
2,works expected sprung higher capacity think ma...
3,think worked greathad diff bran gb card went s...
4,bought retail packaging arrived legit orange e...
...,...
4910,bought sandisk gb class use htc inspire months...
4911,used extending capabilities samsung galaxy not...
4912,great card fast reliable comes optional adapte...
4913,good amount space stuff want fits gopro say


In [11]:
## RARE WORDS
## Rare words cannot form a meaningful pattern within the text, so they are removed.

## First, the frequency of every word across all reviews is determined.

# Join all reviews into one string, split it on whitespace, and count each token.
corpus_text = " ".join(df["reviewText"])
corpus_tokens = corpus_text.split()
temp_df = pd.Series(corpus_tokens).value_counts()
temp_df


Unnamed: 0,count
card,4603
gb,1725
phone,1685
works,1559
great,1445
...,...
sharper,1
cheapos,1
gate,1
measuring,1


In [12]:
## Select the words whose frequency is 1 or lower

# Boolean mask over the frequency table, then keep only the matching entries.
is_rare = temp_df.le(1)
drop = temp_df[is_rare]
drop

Unnamed: 0,count
conclusionmy,1
mbsbuy,1
flimsier,1
disappointedtry,1
priceokay,1
...,...
sharper,1
cheapos,1
gate,1
measuring,1


In [13]:
# Remove the rare words collected in `drop` from every review.
# `word in drop` on a Series tests the index labels, not the values. Taking the
# labels (the words themselves) as a set makes that explicit and gives a plain
# constant-time lookup per token.
rare_words = set(drop.index)

df["reviewText"] = df["reviewText"].apply(
    lambda text: " ".join(word for word in str(text).split() if word not in rare_words)
)
df["reviewText"]

Unnamed: 0,reviewText
0,issues
1,purchased device worked advertised never much ...
2,works expected higher capacity think made bit ...
3,think worked gb card went south one held prett...
4,bought retail packaging arrived legit envelope...
...,...
4910,bought sandisk gb class use htc inspire months...
4911,used capabilities samsung galaxy note greatly ...
4912,great card fast reliable comes optional adapte...
4913,good amount space stuff want fits gopro say


In [14]:
## TOKENIZATION:
## Splitting sentences into their tokens, i.e. breaking sentences into pieces

# Download the "punkt" and "punkt_tab" tokenizer resources.
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [15]:
# Preview tokenization only; the result is displayed, not stored.
# Slicing with head() before apply() tokenizes just the rows that are shown
# instead of the whole column.
df["reviewText"].head().apply(lambda text: TextBlob(text).words)

Unnamed: 0,reviewText
0,[issues]
1,"[purchased, device, worked, advertised, never,..."
2,"[works, expected, higher, capacity, think, mad..."
3,"[think, worked, gb, card, went, south, one, he..."
4,"[bought, retail, packaging, arrived, legit, en..."


In [20]:
## LEMMATIZATION

## The process of reducing words to their root form (e.g. Turkish "gözlükçü" -> "gözlük").
## STEMMING (a related root-reduction technique; this cell only sets up the lemmatizer)

# Download the "wordnet" resource and import the lemmatizer class.
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
# Replace every token in each review with its WordNet lemma.
# The lemmatizer is built once and reused; the previous form constructed a new
# WordNetLemmatizer for every single word.
lemmatizer = WordNetLemmatizer()

df["reviewText"] = df["reviewText"].apply(
    lambda text: " ".join(lemmatizer.lemmatize(word) for word in text.split())
)
df["reviewText"]

Unnamed: 0,reviewText
0,issue
1,purchased device worked advertised never much ...
2,work expected higher capacity think made bit e...
3,think worked gb card went south one held prett...
4,bought retail packaging arrived legit envelope...
...,...
4910,bought sandisk gb class use htc inspire month ...
4911,used capability samsung galaxy note greatly ex...
4912,great card fast reliable come optional adapter...
4913,good amount space stuff want fit gopro say
