In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
origin_df = pd.read_json("Software_5.json", lines=True)
df = origin_df[['overall', 'reviewText', 'summary']]

# Convert to lower cases and tokenize
df.loc[:, 'reviewText'] = df['reviewText'].apply(lambda x: str(x).lower())
df.loc[:, 'summary'] = df['summary'].apply(lambda x: str(x).lower())

# Remove special characters
df.loc[:, 'reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
df.loc[:, 'summary'] = df['summary'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

# Remove stopwords
stopwords = nltk.corpus.stopwords.words('english')
df.loc[:, 'reviewText'] = df['reviewText'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stopwords)]))
df.loc[:, 'summary'] = df['summary'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stopwords)]))

# Lemmatization
lemmatizer = nltk.stem.WordNetLemmatizer()
df.loc[:, 'reviewText'] = df['reviewText'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
df.loc[:, 'summary'] = df['summary'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

df = df.rename(columns={'overall': 'rating', 'reviewText': 'review'})

In [4]:
df.head(20)

Unnamed: 0,rating,review,summary
0,4,ive using dreamweaver predecessor macromedias ...,solid overview dreamweaver cs5
1,4,demo done pc version reference mac version nee...,good value
2,5,youve wanting learn create website either lack...,excellent software want use crash refresher co...
3,5,ive creating website dreamweaver year experien...,fantastic overview dream weaver web design gen...
4,5,decided trying number product switch gold stan...,excellent tutorial
5,5,video wellpaced delivered understandable manne...,excellent
6,5,spent several hour lesson love detailed clear ...,excellent video training material
7,5,dreamweaver mx2004 since came back spent year ...,great video difficult time program
8,5,also taken local community college online cour...,excellent value price
9,5,even though use dreamweaver great deal several...,buy copy dreamweaver


In [5]:
df.to_csv("cleaned_comments.csv", index=False)
df.to_json("cleaned_comments.json", orient='records', lines=True, indent=4)