In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import emoji

# Load the data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\oneplus_merged.csv"
df = pd.read_csv(file_path)

# Handle missing values
# Drop rows where the Reviews text is missing
df.dropna(subset=['Review'], inplace=True)

# Ensure all entries in the Reviews column are strings
df['Review'] = df['Review'].astype(str)

# Remove emojis using the emoji library
def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

# Remove special characters and digits
def remove_special_characters(text):
    return re.sub('[^a-zA-Z\s]', '', text)

# Normalize the text
def normalize_text(text):
    text = text.lower()  # Convert to lowercase
    text = remove_emojis(text)
    text = remove_special_characters(text)
    text = re.sub('\s+', ' ', text)  # Remove extra spaces
    return text

# Apply normalization
df['Cleaned_Review'] = df['Review'].apply(normalize_text)

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['Cleaned_Review'] = df['Cleaned_Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Apply lemmatization
lemmatizer = WordNetLemmatizer()
df['Cleaned_Review'] = df['Cleaned_Review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Save the cleaned data to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\oneplus_reviews_cleaned.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaning completed and saved to", output_file_path)


Data cleaning completed and saved to C:\Users\Elakkiya\Downloads\flipkart\oneplus_reviews_cleaned.csv


In [1]:
import pandas as pd
from autocorrect import Speller

# Load the data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\oneplus_reviews_cleaned.csv"
df = pd.read_csv(file_path)# Apply autocorrect using autocorrect library
spell = Speller(lang='en')

def correct_spelling(text):
    return ' '.join([spell(word) for word in text.split()])

df['Autocorrected_Review'] = df['Cleaned_Review'].apply(correct_spelling)

# Save the cleaned data to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\oneplus_reviews_autocorrected.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaning and spell correction completed and saved to", output_file_path)

Data cleaning and spell correction completed and saved to C:\Users\Elakkiya\Downloads\flipkart\oneplus_reviews_autocorrected.csv


In [6]:
import pandas as pd
import re

# Load the cleaned data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\oneplus_reviews_autocorrected.csv"
df = pd.read_csv(file_path)

# Create a new column 'Reviews' based on 'Autocorrected_Reviews'
df['Reviews'] = df['Autocorrected_Review']

# Dictionary of specific abbreviations and their corrections
abbreviation_dict = {
    'camara': 'camera',
    'fps': 'frames per second',
    'oplus': 'oneplus',
    'sd gen': 'snapdragon generation',
    'superb': 'super',
    'professor': 'processor',
    'sound college': 'sound quality',
    'dam': 'damn',
    'opp': 'oppo',
    'aws': 'awesome',
    'vry gd': 'very good',
    'supppoppp': 'super',
    'prosesar': 'processor',
    'approx hr sot': 'approximately hour screen on time',
    'u': 'you',
    'hr': 'hour',
    'prosper': 'processor',
    'usa': 'usage',
    'good pro': 'good product',
    'gd': 'good',
    'nice prod': 'nice product',
    'p': 'performance',
    'ui': 'user interface',
    'day us': 'day usage',
    'prosesar': 'processor',
    'flipcart': 'flipkart',
    'battery beast': 'battery best',
    'ease': 'easy',
    'pub': 'pubg',
    'osm': 'awesome',
    'worth karma worthuuu': 'worth the money',
    'ois': 'optical image stabilization',
    'aim happy': 'i am happy',
    'pub cod': 'pubg, call of duty',
    'hz': 'hertz',
    'degree census': 'degree celsius',
    'nd': 'and',
    'hit issue': 'heating issue',
    'ai': 'artificial intelligence',
    'wil': 'will',
    'tax': 'thanks',
    'anthem': 'item',
    'mint camera': 'main camera',
    'heat throttle': 'heating and throttling',
    'ill update': 'i will update',
    'phone devils aswoome': 'phone device is awesome',
    'hdr': 'high dynamic range',
    'fo': 'for',
    'op ti really': 'oneplus really',
    'one plus ph': 'oneplus phone',
    'spr': 'super',
    'apprehension': 'appreciation',
    'max': 'maximum',
    'oxygen o': 'oxygen os',
    'kinda': 'kind of',
    'medio': 'mediocre',
    'btw': 'by the way',
    'assam': 'awesome',
    'ph': 'phone',
    'bos': 'range boost',
    'beast': 'best',
    'dlr': 'dslr',
    'batter': 'better',
    'cod': 'call of duty',
    'nyc': 'nice',
    'unvilevebale item': 'unbelievable item',
    'supper': 'super',
    'op': 'oneplus',
    'le':'less',
    'assume': 'awesome',
    'osm dolly atoms': 'awesome dolby atmos',
    'mic failed': 'microphone failed',
    'premium paper': 'premium feel',
    'ver': 'very',
    'gonna': 'going to',
    'pub': 'pubg',
    'gun': 'good',
    'jus': 'just',
    'ive': 'i have',
    'extent': 'excellent',
    'nice hoon': 'nice phone',
    'gr': 'great',
    'math': 'match',
    'bt': 'but',
    'ok':'okay'
}

# Replace abbreviations in the Reviews column
for abbrev, full_form in abbreviation_dict.items():
    # Use word boundaries to ensure only whole words are replaced
    df['Reviews'] = df['Reviews'].str.replace(r'\b' + re.escape(abbrev) + r'\b', full_form, regex=True)

# List of unwanted words
unwanted_words = ['euuuuuuuuuuuuuu', 'imei','xx','o', 'p', 'k', 'x', 'w', 'st', 'isbn', 'gb', 'min', 'mah', 'x x x', 'makhan','bim', 'rd', 'r', 'n', 'nd', 'mm', 'fish', 'apple ka bar', 'mp', 'ppi', 'yrr', 'imei']

# Remove unwanted words from the Reviews column
def remove_unwanted_words(text):
    pattern = r'\b(' + '|'.join(re.escape(word) for word in unwanted_words) + r')\b'
    return re.sub(pattern, '', text)

df['Reviews'] = df['Reviews'].apply(remove_unwanted_words)

# Save the updated DataFrame back to CSV
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\oneplus_reviews_final.csv"
df.to_csv(output_file_path, index=False)

print("Corrections made in the Reviews column, and file saved successfully.")


Corrections made in the Reviews column, and file saved successfully.


In [7]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import emoji

# Load the data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\oppo_merged.csv"
df = pd.read_csv(file_path)

# Handle missing values
# Drop rows where the Reviews text is missing
df.dropna(subset=['Review'], inplace=True)

# Ensure all entries in the Reviews column are strings
df['Review'] = df['Review'].astype(str)

# Remove emojis using the emoji library
def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

# Remove special characters and digits
def remove_special_characters(text):
    return re.sub('[^a-zA-Z\s]', '', text)

# Normalize the text
def normalize_text(text):
    text = text.lower()  # Convert to lowercase
    text = remove_emojis(text)
    text = remove_special_characters(text)
    text = re.sub('\s+', ' ', text)  # Remove extra spaces
    return text

# Apply normalization
df['Cleaned_Review'] = df['Review'].apply(normalize_text)

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['Cleaned_Review'] = df['Cleaned_Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Apply lemmatization
lemmatizer = WordNetLemmatizer()
df['Cleaned_Review'] = df['Cleaned_Review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Save the cleaned data to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\oppo_reviews_cleaned.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaning completed and saved to", output_file_path)


Data cleaning completed and saved to C:\Users\Elakkiya\Downloads\flipkart\oppo_reviews_cleaned.csv


In [7]:
df=pd.read_csv(r"C:\Users\Elakkiya\Downloads\flipkart\oppo_reviews_cleaned.csv")
df.isnull().sum()

Product_Name      0
Product_Link      0
Product_Price     0
Review            0
Rating            0
Cleaned_Review    2
dtype: int64

In [8]:
# Display the number of missing values in each column
print("Missing values before dropping:\n", df.isnull().sum())

# Drop rows where 'Cleaned_Review' is missing
df.dropna(subset=['Cleaned_Review'], inplace=True)

# Display the number of missing values after dropping
print("Missing values after dropping:\n", df.isnull().sum())



Missing values before dropping:
 Product_Name      0
Product_Link      0
Product_Price     0
Review            0
Rating            0
Cleaned_Review    2
dtype: int64
Missing values after dropping:
 Product_Name      0
Product_Link      0
Product_Price     0
Review            0
Rating            0
Cleaned_Review    0
dtype: int64


In [15]:
import pandas as pd
from autocorrect import Speller

# Load the data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\oppo_reviews_cleaned.csv"
df = pd.read_csv(file_path)

# Fill NaN values with an empty string
df['Cleaned_Review'].fillna('', inplace=True)

# Ensure all entries in the Cleaned_Review column are strings
df['Cleaned_Review'] = df['Cleaned_Review'].astype(str)

# Apply autocorrect using autocorrect library
spell = Speller(lang='en')

def correct_spelling(text):
    return ' '.join([spell(word) for word in text.split()])

# Apply spelling correction
df['Autocorrected_Review'] = df['Cleaned_Review'].apply(correct_spelling)

# Save the cleaned data to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\oppo_reviews_autocorrected.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaning and spell correction completed and saved to", output_file_path)


Data cleaning and spell correction completed and saved to C:\Users\Elakkiya\Downloads\flipkart\oppo_reviews_autocorrected.csv


In [16]:
import pandas as pd
import re

# Load the cleaned data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\oppo_reviews_autocorrected.csv" 
df = pd.read_csv(file_path)

# Create a new column 'Reviews' based on 'Autocorrected_Reviews'
df['Reviews'] = df['Autocorrected_Review']

# Dictionary of specific abbreviations and their corrections
abbreviation_dict = {
    'camara': 'camera','fps': 'frames per second','oplus': 'oneplus','sd gen': 'snapdragon generation', 'superb': 'super',
    'professor': 'processor','sound college': 'sound quality','dam': 'damn','opp': 'oppo','aws': 'awesome',
    'vry gd': 'very good','supppoppp': 'super','prosesar': 'processor','approx hr sot': 'approximately hour screen on time',
    'u': 'you','hr': 'hour','prosper': 'processor','usa': 'usage','good pro': 'good product','gd': 'good',
    'nice prod': 'nice product','p': 'performance','ui': 'user interface','day us': 'day usage','prosesar': 'processor',
    'flipcart': 'flipkart','battery beast': 'battery best','ease': 'easy','pub': 'pubg','osm': 'awesome',
    'worth karma worthuuu': 'worth the money','ois': 'optical image stabilization','aim happy': 'i am happy',
    'pub cod': 'pubg, call of duty','hz': 'hertz','degree census': 'degree celsius','nd': 'and',
    'hit issue': 'heating issue','ai': 'artificial intelligence','wil': 'will','tax': 'thanks', 'anthem': 'item',
    'mint camera': 'main camera','heat throttle': 'heating and throttling','ill update': 'i will update',
    'phone devils aswoome': 'phone device is awesome','hdr': 'high dynamic range','fo': 'for',
    'op ti really': 'oneplus really','one plus ph': 'oneplus phone','spr': 'super','apprehension': 'appreciation',
    'max': 'maximum','oxygen o': 'oxygen os','kinda': 'kind of','medio': 'mediocre','btw': 'by the way','assam': 'awesome',
    'ph': 'phone','bos': 'range boost','beast': 'best','dlr': 'dslr','batter': 'better','cod': 'call of duty','nyc': 'nice',
    'unvilevebale item': 'unbelievable item','supper': 'super','op': 'oneplus','assume': 'awesome',
    'osm dolly atoms': 'awesome dolby atmos','mic failed': 'microphone failed','premium paper': 'premium feel','ver': 'very',
    'gonna': 'going to', 'pub': 'pubg', 'gun': 'good','jus': 'just','ive': 'i have','extent': 'excellent','nice hoon': 'nice phone',
    'gr': 'great','math': 'match','bt': 'but','ok':'okay', 'costa': 'costar', 'beast': 'best', 'hr': 'hour','sha': 'should',
    'wk': 'week', "hadnt": "had not","im": "i am","havent": "have not", "hasnt": "has not", 'u': "you", 'r': "are",
    "ui": "user interface", "doesnt": "does not", 'rambling': "rumbling", "io": "iphone operating system", 
    "sot": "special operation team", "le": "less", "fhd": "full high definition", "dont": "do not",
    "cam": "camera", "came": "camera", "avg": "average","yea": "yeah", "lil": "little", 
    "costlier": "more expensive", "it's": "it is", "very": "very", "n": "and", "cant": "cannot", "dis": "this", "v": "we", 
    "hdr": 'high dynamic range', "didnt": "did not", "ive": "i have", "bezel": "bezel", "ur": "your", "wont": 'will not', 
    "hd": 'high definition', 'cleanui': "clean user interface", "tatics": "haptics", "sd": "secure digital", "gen": "generation", 
    "usp": 'unique selling proposition', 'degc': "degree celsius", "tatic": "haptic", "unbuilt": "inbuilt", 'xiomi': "xiaomi",
    'regreating': "regretting", 'fyi': "for your information", 'issuehrs': "issue hours", 'doomed': "zoomed",
    'ois': "optical image stabilization", 'theyll': "they will", 'ig': "instagram", 'bbd': "bigger better deal", 'cemra': "camera", 
    'fastly': "fast", "optimise": 'optimize', 'osum': 'awesome', 'vi': "vodafone", 'upi': "unified payments interface",
    'eyeturner': "eye turner", 'banger': "banger", 're': "resolution", 'goddamn': "goddamn", 'aint': "am not", 'plesently': "pleasantly", 
    'thik': "think", "tooo": "too", "uisvery": 'user interface very', 'gif': "graphics interchange format", 'siz': "six", 
    'costlier': "costlier", 'iphones': "iphone", 'youre': 'you are', 'doubtbut': "doubt but", 'phome': "phone", 'red': "redmi", 
    'okif': "okay if", 'pic': "picture", 'smatter': 'smarter', 'membrane': "ambrane", "holdnew": 'hold new', 'swine': "swipe", 
    'inshot': "inshot", 'pixelated': "pixel related", 'cameraai': "camera artificial intelligence", 'oppos': 'oppo', 'amaze': "amazing", 
    'daytoday': "day to day", 'offmy': "off my", 'laggy': "lag", 'victus': "victus", 'slowlike': "slow like",
    'wholeday': "full day",'commendable': "recommendable",'baku': "vaku", 
    'beat': "best", 'surfed': "suffered", 'bgmi': "battle ground mobile india",'isnt': "is not", 
    'flickering': "flickering", 'least': "at least", 'doun': "down", 'thats': "that is", 'ill': "i will", 'bout': "about", 
    'butter': "better", 'emi': "equated monthly installment", 'prefect': "perfect", 'ketone': "keyone", 
    'ie': "in other words", 'cuz': "because","youve": "you have",'ott': "over the top", 
    'oct': "october", 'ois': "optical image stabilization", 'ip': "iphone", 'nowhope': "now hope", 'red': "redmi",
    "eraserunblur": "eraser focus", "wifi": "wireless fidelity", 'ok': "okay", 'hiccup': "hiccup",
    'janso': "january so", 'mahmaybe': "milliampere hour may be", 'lovable': "lovable", 'etc': "et cetera","karma": "varma",
    'dayyyyyyyy': "day", 'pic': "picture", 'camra': "camera", 'Vry': "very", 'easilllyy': "easily",
    'came': "camera", 'mob': "mobile", 'flipcard': "flipkart", 'Kkk': "okay",'swimmer':'shimmery'
    # Add more abbreviations and corrections as needed
}

# Replace abbreviations in the Reviews column
for abbrev, full_form in abbreviation_dict.items():
    # Use word boundaries to ensure whole words are replaced
    df['Reviews'] = df['Reviews'].str.replace(r'\b' + abbrev + r'\b', full_form, case=False, regex=True)

# Unwanted words to remove
unwanted_words = ['euuuuuuuuuuuuuu', 'imei','xx','o', 'p', 'k', 'x', 'w', 'st', 'isbn', 'gb', 'min', 'mah', 'x x x', 'makhan','bim', 'rd', 'r',
                  'n', 'nd', 'mm', 'fish', 'apple ka bar', 'mp', 'ppi', 'yrr', 'imei','k', 'science', 'x', 'nd', 'rd', 'le', 'st', 'p', 'pm', 'f',
                  'tho', 'h', 'th', 'gn', 'xr', 'xl', 'am', 'tg', 'p', 'z','ce', 'ip', 'lt', 'td', 'gt', 'lea', 'la', 'der', 'ir', 'j',
                  'sp', 'th', 'v', 'cg', 'wee', 'seg','g','pro'
]

# Remove unwanted words
for word in unwanted_words:
    df['Reviews'] = df['Reviews'].str.replace(r'\b' + word + r'\b', '', case=False, regex=True)

# Save the updated DataFrame to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\oppo_reviews_final.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaned and saved successfully.")


Data cleaned and saved successfully.


In [14]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import emoji

# Load the data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\Honor_merged.csv"
df = pd.read_csv(file_path)

# Handle missing values
# Drop rows where the Reviews text is missing
df.dropna(subset=['Review'], inplace=True)

# Ensure all entries in the Reviews column are strings
df['Review'] = df['Review'].astype(str)

# Remove emojis using the emoji library
def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

# Remove special characters and digits
def remove_special_characters(text):
    return re.sub('[^a-zA-Z\s]', '', text)

# Normalize the text
def normalize_text(text):
    text = text.lower()  # Convert to lowercase
    text = remove_emojis(text)
    text = remove_special_characters(text)
    text = re.sub('\s+', ' ', text)  # Remove extra spaces
    return text

# Apply normalization
df['Cleaned_Review'] = df['Review'].apply(normalize_text)

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['Cleaned_Review'] = df['Cleaned_Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Apply lemmatization
lemmatizer = WordNetLemmatizer()
df['Cleaned_Review'] = df['Cleaned_Review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Save the cleaned data to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\Honor_reviews_cleaned.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaning completed and saved to", output_file_path)


Data cleaning completed and saved to C:\Users\Elakkiya\Downloads\flipkart\Honor_reviews_cleaned.csv


In [18]:
import pandas as pd
from autocorrect import Speller

# Load the data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\Honor_reviews_cleaned.csv"
df = pd.read_csv(file_path)

# Fill NaN values with an empty string
df['Cleaned_Review'].fillna('', inplace=True)

# Ensure all entries in the Cleaned_Review column are strings
df['Cleaned_Review'] = df['Cleaned_Review'].astype(str)

# Apply autocorrect using autocorrect library
spell = Speller(lang='en')

def correct_spelling(text):
    return ' '.join([spell(word) for word in text.split()])

# Apply spelling correction
df['Autocorrected_Review'] = df['Cleaned_Review'].apply(correct_spelling)

# Save the cleaned data to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\Honor_reviews_autocorrected.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaning and spell correction completed and saved to", output_file_path)


Data cleaning and spell correction completed and saved to C:\Users\Elakkiya\Downloads\flipkart\Honor_reviews_autocorrected.csv


In [1]:
import pandas as pd
import re

# Load the cleaned data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\Honor_reviews_autocorrected.csv"
df = pd.read_csv(file_path)

# Create a new column 'Reviews' based on 'Autocorrected_Reviews'
df['Reviews'] = df['Autocorrected_Review']

# Dictionary of specific abbreviations and their corrections
abbreviation_dict = {
    'camara': 'camera', 'fps': 'frames per second', 'oplus': 'oneplus', 'sd gen': 'snapdragon generation', 'superb': 'super',
    'professor': 'processor', 'sound college': 'sound quality', 'dam': 'damn', 'opp': 'oppo', 'aws': 'awesome',
    'vry gd': 'very good', 'supppoppp': 'super', 'prosesar': 'processor', 'approx hr sot': 'approximately hour screen on time',
    'u': 'you', 'hr': 'hour', 'prosper': 'processor', 'usa': 'usage', 'good pro': 'good product', 'gd': 'good',
    'nice prod': 'nice product', 'p': 'performance', 'ui': 'user interface', 'day us': 'day usage', 'prosesar': 'processor',
    'flipcart': 'flipkart', 'battery beast': 'battery best', 'ease': 'easy', 'pub': 'pubg', 'osm': 'awesome',
    'worth karma worthuuu': 'worth the money', 'ois': 'optical image stabilization', 'aim happy': 'i am happy',
    'pub cod': 'pubg, call of duty', 'hz': 'hertz', 'degree census': 'degree celsius', 'nd': 'and',
    'hit issue': 'heating issue', 'ai': 'artificial intelligence', 'wil': 'will', 'tax': 'thanks', 'anthem': 'item',
    'mint camera': 'main camera', 'heat throttle': 'heating and throttling', 'ill update': 'i will update',
    'phone devils aswoome': 'phone device is awesome', 'hdr': 'high dynamic range', 'fo': 'for',
    'op ti really': 'oneplus really', 'one plus ph': 'oneplus phone', 'spr': 'super', 'apprehension': 'appreciation',
    'max': 'maximum', 'oxygen o': 'oxygen os', 'kinda': 'kind of', 'medio': 'mediocre', 'btw': 'by the way', 'assam': 'awesome',
    'ph': 'phone', 'bos': 'range boost', 'beast': 'best', 'dlr': 'dslr', 'batter': 'better', 'cod': 'call of duty', 'nyc': 'nice',
    'unvilevebale item': 'unbelievable item', 'supper': 'super', 'op': 'oneplus', 'assume': 'awesome',
    'osm dolly atoms': 'awesome dolby atmos', 'mic failed': 'microphone failed', 'premium paper': 'premium feel', 'ver': 'very',
    'gonna': 'going to', 'pub': 'pubg', 'gun': 'good', 'jus': 'just', 'ive': 'i have', 'extent': 'excellent', 'nice hoon': 'nice phone',
    'gr': 'great', 'math': 'match', 'bt': 'but', 'ok':'okay', 'costa': 'costar', 'beast': 'best', 'hr': 'hour', 'sha': 'should',
    'wk': 'week', "hadnt": "had not", "im": "i am", "havent": "have not", "hasnt": "has not", 'u': "you", 'r': "are",
    "ui": "user interface", "doesnt": "does not", 'rambling': "rumbling", "io": "iphone operating system", 
    "sot": "special operation team", "le": "less", "fhd": "full high definition", "dont": "do not",
    "cam": "camera", "came": "camera", "avg": "average", "yea": "yeah", "lil": "little", 
    "costlier": "more expensive", "it's": "it is", "very": "very", "n": "and", "cant": "cannot", "dis": "this", "v": "we", 
    "hdr": 'high dynamic range', "didnt": "did not", "ive": "i have", "bezel": "bezel", "ur": "your", "wont": 'will not', 
    "hd": 'high definition', 'cleanui': "clean user interface", "tatics": "haptics", "sd": "secure digital", "gen": "generation", 
    "usp": 'unique selling proposition', 'degc': "degree celsius", "tatic": "haptic", "unbuilt": "inbuilt", 'xiomi': "xiaomi",
    'regreating': "regretting", 'fyi': "for your information", 'issuehrs': "issue hours", 'doomed': "zoomed",
    'ois': "optical image stabilization", 'theyll': "they will", 'ig': "instagram", 'bbd': "bigger better deal", 'cemra': "camera", 
    'fastly': "fast", "optimise": 'optimize', 'osum': 'awesome', 'vi': "vodafone", 'upi': "unified payments interface",
    'eyeturner': "eye turner", 'banger': "banger", 're': "resolution", 'goddamn': "goddamn", 'aint': "am not", 'plesently': "pleasantly", 
    'thik': "think", "tooo": "too", "uisvery": 'user interface very', 'gif': "graphics interchange format", 'siz': "six", 
    'costlier': "costlier", 'iphones': "iphone", 'youre': 'you are', 'doubtbut': "doubt but", 'phome': "phone", 'red': "redmi", 
    'okif': "okay if", 'pic': "picture", 'smatter': 'smarter', 'membrane': "ambrane", "holdnew": 'hold new', 'swine': "swipe", 
    'inshot': "inshot", 'pixelated': "pixel related", 'cameraai': "camera artificial intelligence", 'oppos': 'oppo', 'amaze': "amazing", 
    'daytoday': "day to day", 'offmy': "off my", 'laggy': "lag", 'victus': "victus", 'slowlike': "slow like",
    'wholeday': "full day", 'commendable': "recommendable", 'baku': "vaku", 
    'beat': "best", 'surfed': "suffered", 'bgmi': "battle ground mobile india", 'isnt': "is not", 
    'flickering': "flickering", 'least': "at least", 'doun': "down", 'thats': "that is", 'ill': "i will", 'bout': "about", 
    'butter': "better", 'emi': "equated monthly installment", 'prefect': "perfect", 'ketone': "keyone", 
    'ie': "in other words", 'cuz': "because", "youve": "you have", 'ott': "over the top", 
    'oct': "october", 'ois': "optical image stabilization", 'ip': "iphone", 'nowhope': "now hope", 'red': "redmi",
    "eraserunblur": "eraser focus", "wifi": "wireless fidelity", 'ok': "okay", 'hiccup': "hiccup",
    'janso': "january so", 'mahmaybe': "milliampere hour may be", 'lovable': "lovable", 'etc': "et cetera", "karma": "varma",
    'dayyyyyyyy': "day", 'pic': "picture", 'camra': "camera", 'Vry': "very", 'easilllyy': "easily",
    'came': "camera", 'mob': "mobile", 'flipcard': "flipkart", 'Kkk': "okay", 'swimmer': 'shimmery', 'supeeeeer': 'super', 'ur': 'your', 'nais': 'nice', 'descent': 'decent', 'se user': 'super', 'nris': 'nice',
    'suuuuuuuuuuuuuuper': 'super', 'project': 'product', 'xtra': 'extra', 'userniterface': 'user interface', 'prossecer': 'processor', 'amazingest': 'amazing',
    'col': 'color', 'superbbbbb': 'superb', 'goog': 'good', 'censor': 'sensor', 'ram': 'ram', 'wwwhhhoooooooo': 'who', 'pictre': 'picture', 'comera': 'camera', 'fa': 'for',
    'truely': 'truly', 'fonnne': 'phone', 'excellant': 'excellent', 'totallly': 'totally', 'ceramicback': 'ceramic back', 'internat': 'internet', 'iui': 'miui', 'camaera': 'camera', 'supper': 'super',
    'exllent': 'excellent', 'magnafiqui': 'magnificent', 'affecfordble': 'affordable', 'smmmmooooothhhhhh': 'smooth', 'veryg': 'very good', 'camaera': 'camera',
    'ar': 'are', 'eet': 'get', 'june': 'june', 'ster': 'star', 'osam': 'awesome', 'gona': 'gonna', 'suchh': 'such', 'amoled': 'amoled', 'fon': 'phone', 'ram': 'ram', 'exp': 'expensive',
    'xclent': 'excellent', 'ans': 'and', 'shotcut': 'shortcut', 'expoerience': 'experience', 'bgmi': 'battle grounds mobile india', 'anr': 'and', 'whao': 'wow', 'awesomeeee': 'awesome', 'blurr': 'blur',
    'definately': 'definitely', 'suuuuper': 'super', 'nize': 'nice', 'phobe': 'phone', 'thanksflipcart': 'thanks flipkart', 'mimax': 'mi max', 'picure': 'picture', 'blutooth': 'bluetooth', 'qhy': 'why',
    'user interfce': 'user interface', 'beasutiful': 'beautiful', 'smother': 'smoother', 'gooddd': 'good', 'ramgood': 'ram good', 'smother': 'smoother', 'suggestable': 'suggestible', 'gamesro': 'games to',
    'playbles': 'playable', 'fes': 'fest', 'crystalclear': 'crystal clear', 'ph': 'phone', 'frimeware': 'firmware', 'dowload': 'download', 'camara': 'camera', 'gud': 'good', 'jio': 'jio', 'againflipkart': 'again flipkart',
    'imposible': 'impossible', 'comfortbale': 'comfortable', 'whith': 'with', 'chages': 'changes', 'consiser': 'consider', 'heatsup': 'heats up', 'grt': 'great', 'fringerprint': 'fingerprint', 'detailingg': 'detailing',
    'gud': 'good', 'hz': 'hertz', 'nd': 'and', 'soundcollage': 'sound quality', 'wi': 'with', 'appreciateble': 'appreciable', 'phn': 'phone', 'mu': 'music', 'preforance': 'performance', 'veryyy': 'very',
    'ceramicback': 'ceramic back', 'ceraicback': 'ceramic back', 'battrey': 'battery', 'hollywow': 'hollywood', 'expirence': 'experience', 'niceui': 'nice user interface', 'terriffic': 'terrific', 'varien': 'variant',
    'xcellent': 'excellent', 'sorring': 'scoring', 'comfrtable': 'comfortable', 'mics': 'microphones', 'cds': 'cd', 'whot': 'what', 'mindbloing': 'mind blowing', 'gamesome': 'games on', 'fluently': 'fluent',
    'dat': 'that', 'amasing': 'amazing', 'superrr': 'super', 'osm': 'awesome', 'hreat': 'great', 'costefficeint': 'cost efficient', 'bhaskar': 'bhaskaran', 'heatingproblem': 'heating problem',
    'miracast': 'miracast', 'definately': 'definitely', 'wo': 'would', 'thsnks': 'thanks', 'whywhy': 'why why', 'cam': 'camera', 'gaurantee': 'guarantee', 'crazychaging': 'crazy charging',
    'interfece': 'interface', 'excellant': 'excellent', 'intenation': 'internet', 'bettry': 'battery', 'exellent': 'excellent', 'lockscreen': 'lock screen', 'tap': 'tap', 'eb': 'is',
    'cam': 'camera', 'exellent': 'excellent', 'pros': 'pros', 'cus': 'because', 'definatly': 'definitely', 'microphoneon': 'microphone on', 'renaining': 'remaining', 'nxet': 'next',
    'switcher': 'switcher', 'yooo': 'yo', 'exlent': 'excellent', 'vol': 'volume', 'magic': 'magic', 'beacuse': 'because', 'ven': 'even', 'expence': 'expense', 'pones': 'phones',
    'pu': 'pubg', 'exlent': 'excellent', 'bgmi': 'battle ground mobile india', 'athir': 'arshad', 'ramaining': 'remaining', 'docing': 'docking', 'batterylife': 'battery life', 'batt': 'battery', 
    'cas': 'case', 'proble': 'problem', 'sa': 'sa', 'lense': 'lens', 'cameraof': 'camera of', 'parformance': 'performance', 'whatsapp': 'whatsapp', 'camra': 'camera', 'amoleddisplay': 'amoled display',
    'hugh': 'high', 'exellent': 'excellent', 'phon': 'phone', 'boaring': 'boring', 'sync': 'sync', 'flipkart': 'flipkart', 'tp': 'to', 'displayflipkart': 'display flipkart', 'flipcart': 'flipkart',
    'complain': 'complain', 'sdgen': 'snapdragon generation', 'comerabgmi': 'camera battle ground mobile india', 'doint': 'do not', 'iys': 'it is', 'superp': 'super', 'expeeience': 'experience', 'loook': 'look',
    'couple': 'couple', 'phoneflipkart': 'phone flipkart', 'got': 'got', 'consisency': 'consistency', 'dispkay': 'display', 'proc': 'processor', 'gud': 'good', 'clarityclearity': 'clarity', 'octo': 'octa',
    'excessive': 'excessive', 'goodem': 'good em', 'snapdragon': 'snapdragon', 'camara': 'camera', 'sper': 'super', 'rebooting': 'rebooting', 'accessorry': 'accessory', 'usr': 'user', 'dispkay': 'display',
    'pictureclarity': 'picture clarity', 'dispkayclarity': 'display clarity', 'extremely': 'extremely', 'awsome': 'awesome', 'goodcamera': 'good camera', 'goodclatrity': 'good clarity', 'miui': 'miui',
    'clearn': 'clear', 'cllick': 'click', 'reseting': 'resetting', 'psl': 'psl', 'edxtra': 'extra', 'redmy': 'redmi', 'gamy': 'game', 'biometrics': 'biometrics', 'noney': 'money', 'lenovo': 'lenovo',
    'possessing': 'processing', 'ossam': 'awesome', 'tis': 'this', 'surport': 'support', 'awsom': 'awesome', 'supere': 'super', 'serviceflipkart': 'service flipkart', 'awesomeas': 'awesome', 'greate': 'great',
    'mobs': 'mobile', 'sept': 'september', 'its': 'its', 'wasawsm': 'was awesome', 'flipkartgood': 'flipkart good', 'easeofuse': 'ease of use', 'mi': 'mi', 'perfectperfect': 'perfect', 'ohow': 'oh how',
    'dosn': 'does', 'okkay': 'okay', 'awdome': 'awesome', 'des': 'des', 'awesomefeature': 'awesome feature', 'circa': 'circa', 'gud': 'good', 'backca': 'back cover', 'yhe': 'the', 'niceproduct': 'nice product',
    'me':'mi', 'awesomegod': 'awesome', 'tricking': 'tricking', 'allure': 'allure', 'mia1': 'mi a1', 'backpanel': 'back panel', 'microwave': 'microphone', 'fri': 'friend', 'frusterating': 'frustrating',
    'rs': 'rs', 'chat': 'chat', 'saiz': 'size', 'ram': 'ram', 'workswll': 'works well', 'mid': 'mi', 'ph': 'phone', 'flpkrt': 'flipkart', 'blowming': 'blowing', 'complaince': 'complain', 'ar': 'are', 'phn': 'phone',
    'usb': 'usb', 'coverback': 'cover back', 'awesomecamera': 'awesome camera', 'osame': 'awesome', 'bat': 'battery', 'weighing': 'weighing', 'midrange': 'mid range', 'blowng': 'blowing', 'yoursite': 'your site',
    'superbg': 'superb', 'intetnet': 'internet', 'besutiful': 'beautiful', 'suuuper': 'super', 'ate': 'ate', 'ceramicback': 'ceramic back', 'velvet': 'velvet', 'goog': 'good', 'doeswork': 'does work', 'interrface': 'interface',
    'spellings': 'spellings', 'competetion': 'competition', 'clarityis': 'clarity is', 'xclent': 'excellent', 'crome': 'chrome', 'sinply': 'simply', 'cermaic': 'ceramic', 'plz': 'please', 'compability': 'compatibility',
    'excellnt': 'excellent', 'iui': 'miui', 'ultraa': 'ultra', 'whatsoever': 'whatsoever', 'goodand': 'good and', 'gotcamera': 'got camera', 'ple': 'please', 'afortable': 'affordable', 'kns': 'kms', 'nicefeature': 'nice feature',
    'redemi': 'redmi', 'xxent': 'excellent', 'cuttrt': 'cutout', 'mi': 'mi', 'serires': 'series', 'whish': 'wish', 'dlexo': 'deluxe', 'cristal': 'crystal', 'makesense': 'make sense', 'gud': 'good', 'expellience': 'experience',
    'reflectiion': 'reflection', 'microphone': 'microphone', 'cameraquality': 'camera quality', 'awesomef': 'awesome', 'guddisplay': 'good display', 'frustrting': 'frustrating', 'vol': 'volume', 'cermaic': 'ceramic', 'clearence': 'clearance', 
    'snapdragon': 'snapdragon', 'ceramicback': 'ceramic back', 'cermaicback': 'ceramic back', 'snapdragon820': 'snapdragon 820', 'miui8': 'miui 8', 'snapdragon': 'snapdragon', 'snapdragon': 'snapdragon', 'snapdragon835': 'snapdragon 835', 'snapdragon820': 'snapdragon 820'
}
# Replace abbreviations in the Reviews column
for abbrev, full_form in abbreviation_dict.items():
    # Use word boundaries to ensure whole words are replaced
    df['Reviews'] = df['Reviews'].str.replace(r'\b' + abbrev + r'\b', full_form, case=False, regex=True)

unwanted_words = [
    'euuuuuuuuuuuuuu', 'imei', 'xx', 'o', 'p', 'k', 'x', 'w', 'st', 'isbn', 'gb', 'min', 'mah', 'x x x', 'makhan',
    'bim', 'rd', 'r', 'n', 'nd', 'mm', 'fish', 'apple ka bar', 'mp', 'ppi', 'yrr', 'imei', 'k', 'science', 'x', 'nd',
    'rd', 'le', 'st', 'p', 'pm', 'f', 'tho', 'h', 'th', 'gn', 'xr', 'xl', 'am', 'tg', 'p', 'z', 'ce', 'ip', 'lt', 'td',
    'gt', 'lea', 'la', 'der', 'ir', 'j', 'sp', 'th', 'v', 'cg', 'wee', 'seg', 'g', 'pro', 'x', 'p', 'e', 'opt', 'nfc',
    'tm', 'ir', 'xnd', 'ffd', 'htc', 'cal', 'pm', 'ordo', 'gh', 'rn', 'mia', 'ip', 'sd', 'oct', 'core', 'expos', 'rd',
    'ip', 'pe', 'amp', 'paisa bassol', 'pc', 'al', 'zindabad', 'dnd', 'rsk', 'g', 'ooooooo', 'mmmmmmm', 'ggggggg', 'vfd',
    'havoc', 'ah', 'hi', 'hai', 'swimming', 'gop', 'ji', 'lajawab', 'asap loose', 'ie', 'h ago', 'madathukulam', 'mo', 
    'fea', 'l', 'ex', 'qu', 'Lotus', 'voodoo', 'ko', 'ont', 'maa', 'ag', 'ne', 'quot', 'sakuntala', 'rayagada bissamcuttack',
    'gimmick', 'thi', 'str', 'fatafati', 'mast', 'garcia jharkhand', 'sa', 'isvudn', 'x', 'sg', 'hijab', 'tbh', 'wars', 
    'nts', 'huawei', 'keirin', 'ofc', 'pg', 'sec', 'jaar', 'hai ye', 'th', 'ft', 'emu', 'gpu', 'fr', 'nit', 'eis', 'oo',
    'hh', 'ly', 'fei', 'q', 'c', 'sw', 'kd', 'ofc', 'bl', 'lol', 'et', 'rgh', 'extremewwe', 'dts', 'iu', 'vr', 'Soo', 
    'al', 'sm', 'rsk', 'eta', 'gh', 'nu', 'ota', 'fp'
]
# Remove unwanted words
for word in unwanted_words:
    df['Reviews'] = df['Reviews'].str.replace(r'\b' + word + r'\b', '', case=False, regex=True)

# Save the updated DataFrame to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\Honor_reviews_final.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaned and saved successfully.")

    


Data cleaned and saved successfully.


In [None]:
'supeeeeer':'super','ur':'your','nais':'nice','descent':'decent','se user':'super','nris':'nice',
'suuuuuuuuuuuuuuper':'super','project':'product','xtra':'extra','userniterface':'user interface','prossecer':'processor''amazingest':'amazing',
'col':'because','youre':'your','nov':'november','http':'hitup','hitting':'heating','sam':'awesome','nic':'nice','allk' : 'all'

x,sg,hijab,tbh,wars,nts,huawei,keirin,ofc,pg,sec,jaar,hai ye,th,ft,emu,gpu,fr,nit,'eis', 'oo', 'hh, ly, fei, q, c, sw, kd, ofc,bl, lol,et, rgh,
extremewwe: extreme, dts, iu, vr,Soo: so, 
al, sm, rsk, eta, gh, nu, ota,fp


In [52]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import emoji

# Load the data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\motorola_merged.csv"
df = pd.read_csv(file_path)

# Handle missing values
# Drop rows where the Reviews text is missing
df.dropna(subset=['Review'], inplace=True)

# Ensure all entries in the Reviews column are strings
df['Review'] = df['Review'].astype(str)

# Remove emojis using the emoji library
def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

# Remove special characters and digits
def remove_special_characters(text):
    return re.sub('[^a-zA-Z\s]', '', text)

# Normalize the text
def normalize_text(text):
    text = text.lower()  # Convert to lowercase
    text = remove_emojis(text)
    text = remove_special_characters(text)
    text = re.sub('\s+', ' ', text)  # Remove extra spaces
    return text

# Apply normalization
df['Cleaned_Review'] = df['Review'].apply(normalize_text)

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['Cleaned_Review'] = df['Cleaned_Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Apply lemmatization
lemmatizer = WordNetLemmatizer()
df['Cleaned_Review'] = df['Cleaned_Review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Save the cleaned data to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\motorola_reviews_cleaned.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaning completed and saved to", output_file_path)


Data cleaning completed and saved to C:\Users\Elakkiya\Downloads\flipkart\motorola_reviews_cleaned.csv


In [53]:
import pandas as pd
from autocorrect import Speller

# Load the data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\motorola_reviews_cleaned.csv"
df = pd.read_csv(file_path)

# Fill NaN values with an empty string
df['Cleaned_Review'].fillna('', inplace=True)

# Ensure all entries in the Cleaned_Review column are strings
df['Cleaned_Review'] = df['Cleaned_Review'].astype(str)

# Apply autocorrect using autocorrect library
spell = Speller(lang='en')

def correct_spelling(text):
    return ' '.join([spell(word) for word in text.split()])

# Apply spelling correction
df['Autocorrected_Review'] = df['Cleaned_Review'].apply(correct_spelling)

# Save the cleaned data to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\motorola_reviews_autocorrected.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaning and spell correction completed and saved to", output_file_path)


Data cleaning and spell correction completed and saved to C:\Users\Elakkiya\Downloads\flipkart\motorola_reviews_autocorrected.csv


In [4]:
import pandas as pd
import re

# Load the cleaned data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\motorola_reviews_autocorrected.csv"
df = pd.read_csv(file_path)

# Create a new column 'Reviews' based on 'Autocorrected_Reviews'
df['Reviews'] = df['Autocorrected_Review']

# Dictionary of specific abbreviations and their corrections
abbreviation_dict = {
    'camara': 'camera', 'fps': 'frames per second', 'oplus': 'oneplus', 'sd gen': 'snapdragon generation', 'superb': 'super',
    'professor': 'processor', 'sound college': 'sound quality', 'dam': 'damn', 'opp': 'oppo', 'aws': 'awesome',
    'vry gd': 'very good', 'supppoppp': 'super', 'prosesar': 'processor', 'approx hr sot': 'approximately hour screen on time',
    'u': 'you', 'hr': 'hour', 'prosper': 'processor', 'usa': 'usage', 'good pro': 'good product', 'gd': 'good',
    'nice prod': 'nice product', 'p': 'performance', 'ui': 'user interface', 'day us': 'day usage', 'prosesar': 'processor',
    'flipcart': 'flipkart', 'battery beast': 'battery best', 'ease': 'easy', 'pub': 'pubg', 'osm': 'awesome',
    'worth karma worthuuu': 'worth the money', 'ois': 'optical image stabilization', 'aim happy': 'i am happy',
    'pub cod': 'pubg, call of duty', 'hz': 'hertz', 'degree census': 'degree celsius', 'nd': 'and',
    'hit issue': 'heating issue', 'ai': 'artificial intelligence', 'wil': 'will', 'tax': 'thanks', 'anthem': 'item',
    'mint camera': 'main camera', 'heat throttle': 'heating and throttling', 'ill update': 'i will update',
    'phone devils aswoome': 'phone device is awesome', 'hdr': 'high dynamic range', 'fo': 'for',
    'op ti really': 'oneplus really', 'one plus ph': 'oneplus phone', 'spr': 'super', 'apprehension': 'appreciation',
    'max': 'maximum', 'oxygen o': 'oxygen os', 'kinda': 'kind of', 'medio': 'mediocre', 'btw': 'by the way', 'assam': 'awesome',
    'ph': 'phone', 'bos': 'range boost', 'beast': 'best', 'dlr': 'dslr', 'batter': 'better', 'cod': 'call of duty', 'nyc': 'nice',
    'unvilevebale item': 'unbelievable item', 'supper': 'super', 'op': 'oneplus', 'assume': 'awesome',
    'osm dolly atoms': 'awesome dolby atmos', 'mic failed': 'microphone failed', 'premium paper': 'premium feel', 'ver': 'very',
    'gonna': 'going to', 'pub': 'pubg', 'gun': 'good', 'jus': 'just', 'ive': 'i have', 'extent': 'excellent', 'nice hoon': 'nice phone',
    'gr': 'great', 'math': 'match', 'bt': 'but', 'ok':'okay', 'costa': 'costar', 'beast': 'best', 'hr': 'hour', 'sha': 'should',
    'wk': 'week', "hadnt": "had not", "im": "i am", "havent": "have not", "hasnt": "has not", 'u': "you", 'r': "are",
    "ui": "user interface", "doesnt": "does not", 'rambling': "rumbling", "io": "iphone operating system", 
    "sot": "special operation team", "le": "less", "fhd": "full high definition", "dont": "do not",
    "cam": "camera", "came": "camera", "avg": "average", "yea": "yeah", "lil": "little", 
    "costlier": "more expensive", "it's": "it is", "very": "very", "n": "and", "cant": "cannot", "dis": "this", "v": "we", 
    "hdr": 'high dynamic range', "didnt": "did not", "ive": "i have", "bezel": "bezel", "ur": "your", "wont": 'will not', 
    "hd": 'high definition', 'cleanui': "clean user interface", "tatics": "haptics", "sd": "secure digital", "gen": "generation", 
    "usp": 'unique selling proposition', 'degc': "degree celsius", "tatic": "haptic", "unbuilt": "inbuilt", 'xiomi': "xiaomi",
    'regreating': "regretting", 'fyi': "for your information", 'issuehrs': "issue hours", 'doomed': "zoomed",
    'ois': "optical image stabilization", 'theyll': "they will", 'ig': "instagram", 'bbd': "bigger better deal", 'cemra': "camera", 
    'fastly': "fast", "optimise": 'optimize', 'osum': 'awesome', 'vi': "vodafone", 'upi': "unified payments interface",
    'eyeturner': "eye turner", 'banger': "banger", 're': "resolution", 'goddamn': "goddamn", 'aint': "am not", 'plesently': "pleasantly", 
    'thik': "think", "tooo": "too", "uisvery": 'user interface very', 'gif': "graphics interchange format", 'siz': "six", 
    'costlier': "costlier", 'iphones': "iphone", 'youre': 'you are', 'doubtbut': "doubt but", 'phome': "phone", 'red': "redmi", 
    'okif': "okay if", 'pic': "picture", 'smatter': 'smarter', 'membrane': "ambrane", "holdnew": 'hold new', 'swine': "swipe", 
    'inshot': "inshot", 'pixelated': "pixel related", 'cameraai': "camera artificial intelligence", 'oppos': 'oppo', 'amaze': "amazing", 
    'daytoday': "day to day", 'offmy': "off my", 'laggy': "lag", 'victus': "victus", 'slowlike': "slow like",
    'wholeday': "full day", 'commendable': "recommendable", 'baku': "vaku", 
    'beat': "best", 'surfed': "suffered", 'bgmi': "battle ground mobile india", 'isnt': "is not", 
    'flickering': "flickering", 'least': "at least", 'doun': "down", 'thats': "that is", 'ill': "i will", 'bout': "about", 
    'butter': "better", 'emi': "equated monthly installment", 'prefect': "perfect", 'ketone': "keyone", 
    'ie': "in other words", 'cuz': "because", "youve": "you have", 'ott': "over the top", 
    'oct': "october", 'ois': "optical image stabilization", 'ip': "iphone", 'nowhope': "now hope", 'red': "redmi",
    "eraserunblur": "eraser focus", "wifi": "wireless fidelity", 'ok': "okay", 'hiccup': "hiccup",
    'janso': "january so", 'mahmaybe': "milliampere hour may be", 'lovable': "lovable", 'etc': "et cetera", "karma": "varma",
    'dayyyyyyyy': "day", 'pic': "picture", 'camra': "camera", 'Vry': "very", 'easilllyy': "easily",
    'came': "camera", 'mob': "mobile", 'flipcard': "flipkart", 'Kkk': "okay", 'swimmer': 'shimmery', 'supeeeeer': 'super', 'ur': 'your', 'nais': 'nice', 'descent': 'decent', 'se user': 'super', 'nris': 'nice',
    'suuuuuuuuuuuuuuper': 'super', 'project': 'product', 'xtra': 'extra', 'userniterface': 'user interface', 'prossecer': 'processor', 'amazingest': 'amazing',
    'col': 'color', 'superbbbbb': 'superb', 'goog': 'good', 'censor': 'sensor', 'ram': 'ram', 'wwwhhhoooooooo': 'who', 'pictre': 'picture', 'comera': 'camera', 'fa': 'for',
    'truely': 'truly', 'fonnne': 'phone', 'excellant': 'excellent', 'totallly': 'totally', 'ceramicback': 'ceramic back', 'internat': 'internet', 'iui': 'miui', 'camaera': 'camera', 'supper': 'super',
    'exllent': 'excellent', 'magnafiqui': 'magnificent', 'affecfordble': 'affordable', 'smmmmooooothhhhhh': 'smooth', 'veryg': 'very good', 'camaera': 'camera',
    'ar': 'are', 'eet': 'get', 'june': 'june', 'ster': 'star', 'osam': 'awesome', 'gona': 'gonna', 'suchh': 'such', 'amoled': 'amoled', 'fon': 'phone', 'ram': 'ram', 'exp': 'expensive',
    'xclent': 'excellent', 'ans': 'and', 'shotcut': 'shortcut', 'expoerience': 'experience', 'bgmi': 'battle grounds mobile india', 'anr': 'and', 'whao': 'wow', 'awesomeeee': 'awesome', 'blurr': 'blur',
    'definately': 'definitely', 'suuuuper': 'super', 'nize': 'nice', 'phobe': 'phone', 'thanksflipcart': 'thanks flipkart', 'mimax': 'mi max', 'picure': 'picture', 'blutooth': 'bluetooth', 'qhy': 'why',
    'user interfce': 'user interface', 'beasutiful': 'beautiful', 'smother': 'smoother', 'gooddd': 'good', 'ramgood': 'ram good', 'smother': 'smoother', 'suggestable': 'suggestible', 'gamesro': 'games to',
    'playbles': 'playable', 'fes': 'fest', 'crystalclear': 'crystal clear', 'ph': 'phone', 'frimeware': 'firmware', 'dowload': 'download', 'camara': 'camera', 'gud': 'good', 'jio': 'jio', 'againflipkart': 'again flipkart',
    'imposible': 'impossible', 'comfortbale': 'comfortable', 'whith': 'with', 'chages': 'changes', 'consiser': 'consider', 'heatsup': 'heats up', 'grt': 'great', 'fringerprint': 'fingerprint', 'detailingg': 'detailing',
    'gud': 'good', 'hz': 'hertz', 'nd': 'and', 'soundcollage': 'sound quality', 'wi': 'with', 'appreciateble': 'appreciable', 'phn': 'phone', 'mu': 'music', 'preforance': 'performance', 'veryyy': 'very',
    'ceramicback': 'ceramic back', 'ceraicback': 'ceramic back', 'battrey': 'battery', 'hollywow': 'hollywood', 'expirence': 'experience', 'niceui': 'nice user interface', 'terriffic': 'terrific', 'varien': 'variant',
    'xcellent': 'excellent', 'sorring': 'scoring', 'comfrtable': 'comfortable', 'mics': 'microphones', 'cds': 'cd', 'whot': 'what', 'mindbloing': 'mind blowing', 'gamesome': 'games on', 'fluently': 'fluent',
    'dat': 'that', 'amasing': 'amazing', 'superrr': 'super', 'osm': 'awesome', 'hreat': 'great', 'costefficeint': 'cost efficient', 'bhaskar': 'bhaskaran', 'heatingproblem': 'heating problem',
    'miracast': 'miracast', 'definately': 'definitely', 'wo': 'would', 'thsnks': 'thanks', 'whywhy': 'why why', 'cam': 'camera', 'gaurantee': 'guarantee', 'crazychaging': 'crazy charging',
    'interfece': 'interface', 'excellant': 'excellent', 'intenation': 'internet', 'bettry': 'battery', 'exellent': 'excellent', 'lockscreen': 'lock screen', 'tap': 'tap', 'eb': 'is',
    'cam': 'camera', 'exellent': 'excellent', 'pros': 'pros', 'cus': 'because', 'definatly': 'definitely', 'microphoneon': 'microphone on', 'renaining': 'remaining', 'nxet': 'next',
    'switcher': 'switcher', 'yooo': 'yo', 'exlent': 'excellent', 'vol': 'volume', 'magic': 'magic', 'beacuse': 'because', 'ven': 'even', 'expence': 'expense', 'pones': 'phones',
    'pu': 'pubg', 'exlent': 'excellent', 'bgmi': 'battle ground mobile india', 'athir': 'arshad', 'ramaining': 'remaining', 'docing': 'docking', 'batterylife': 'battery life', 'batt': 'battery', 
    'cas': 'case', 'proble': 'problem', 'sa': 'sa', 'lense': 'lens', 'cameraof': 'camera of', 'parformance': 'performance', 'whatsapp': 'whatsapp', 'camra': 'camera', 'amoleddisplay': 'amoled display',
    'hugh': 'high', 'exellent': 'excellent', 'phon': 'phone', 'boaring': 'boring', 'sync': 'sync', 'flipkart': 'flipkart', 'tp': 'to', 'displayflipkart': 'display flipkart', 'flipcart': 'flipkart',
    'complain': 'complain', 'sdgen': 'snapdragon generation', 'comerabgmi': 'camera battle ground mobile india', 'doint': 'do not', 'iys': 'it is', 'superp': 'super', 'expeeience': 'experience', 'loook': 'look',
    'couple': 'couple', 'phoneflipkart': 'phone flipkart', 'got': 'got', 'consisency': 'consistency', 'dispkay': 'display', 'proc': 'processor', 'gud': 'good', 'clarityclearity': 'clarity', 'octo': 'octa',
    'excessive': 'excessive', 'goodem': 'good em', 'snapdragon': 'snapdragon', 'camara': 'camera', 'sper': 'super', 'rebooting': 'rebooting', 'accessorry': 'accessory', 'usr': 'user', 'dispkay': 'display',
    'pictureclarity': 'picture clarity', 'dispkayclarity': 'display clarity', 'extremely': 'extremely', 'awsome': 'awesome', 'goodcamera': 'good camera', 'goodclatrity': 'good clarity', 'miui': 'miui',
    'clearn': 'clear', 'cllick': 'click', 'reseting': 'resetting', 'psl': 'psl', 'edxtra': 'extra', 'redmy': 'redmi', 'gamy': 'game', 'biometrics': 'biometrics', 'noney': 'money', 'lenovo': 'lenovo',
    'possessing': 'processing', 'ossam': 'awesome', 'tis': 'this', 'surport': 'support', 'awsom': 'awesome', 'supere': 'super', 'serviceflipkart': 'service flipkart', 'awesomeas': 'awesome', 'greate': 'great',
    'mobs': 'mobile', 'sept': 'september', 'its': 'its', 'wasawsm': 'was awesome', 'flipkartgood': 'flipkart good', 'easeofuse': 'ease of use', 'mi': 'mi', 'perfectperfect': 'perfect', 'ohow': 'oh how',
    'dosn': 'does', 'okkay': 'okay', 'awdome': 'awesome', 'des': 'des', 'awesomefeature': 'awesome feature', 'circa': 'circa', 'gud': 'good', 'backca': 'back cover', 'yhe': 'the', 'niceproduct': 'nice product',
    'me':'mi', 'awesomegod': 'awesome', 'tricking': 'tricking', 'allure': 'allure', 'mia1': 'mi a1', 'backpanel': 'back panel', 'microwave': 'microphone', 'fri': 'friend', 'frusterating': 'frustrating','moto':'motorola',
    'rs': 'rs', 'chat': 'chat', 'saiz': 'size', 'ram': 'ram', 'workswll': 'works well', 'mid': 'mi', 'ph': 'phone', 'flpkrt': 'flipkart', 'blowming': 'blowing', 'complaince': 'complain', 'ar': 'are', 'phn': 'phone',
    'usb': 'usb', 'coverback': 'cover back', 'awesomecamera': 'awesome camera', 'osame': 'awesome', 'bat': 'battery', 'weighing': 'weighing', 'midrange': 'mid range', 'blowng': 'blowing', 'yoursite': 'your site',
    'superbg': 'superb', 'intetnet': 'internet', 'besutiful': 'beautiful', 'suuuper': 'super', 'ate': 'ate', 'ceramicback': 'ceramic back', 'velvet': 'velvet', 'goog': 'good', 'doeswork': 'does work', 'interrface': 'interface',
    'spellings': 'spellings', 'competetion': 'competition', 'clarityis': 'clarity is', 'xclent': 'excellent', 'crome': 'chrome', 'sinply': 'simply', 'cermaic': 'ceramic', 'plz': 'please', 'compability': 'compatibility',
    'excellnt': 'excellent', 'iui': 'miui', 'ultraa': 'ultra', 'whatsoever': 'whatsoever', 'goodand': 'good and', 'gotcamera': 'got camera', 'ple': 'please', 'afortable': 'affordable', 'kns': 'kms', 'nicefeature': 'nice feature',
    'redemi': 'redmi', 'xxent': 'excellent', 'cuttrt': 'cutout', 'mi': 'mi', 'serires': 'series', 'whish': 'wish', 'dlexo': 'deluxe', 'cristal': 'crystal', 'makesense': 'make sense', 'gud': 'good', 'expellience': 'experience',
    'reflectiion': 'reflection', 'microphone': 'microphone', 'cameraquality': 'camera quality', 'awesomef': 'awesome', 'guddisplay': 'good display', 'frustrting': 'frustrating', 'vol': 'volume', 'cermaic': 'ceramic', 'clearence': 'clearance', 
    'snapdragon': 'snapdragon', 'ceramicback': 'ceramic back', 'cermaicback': 'ceramic back', 'snapdragon820': 'snapdragon 820', 'miui8': 'miui 8', 'snapdragon': 'snapdragon', 'snapdragon': 'snapdragon', 'snapdragon835': 'snapdragon 835', 'snapdragon820': 'snapdragon 820'
}
# Replace abbreviations in the Reviews column
for abbrev, full_form in abbreviation_dict.items():
    # Use word boundaries to ensure whole words are replaced
    df['Reviews'] = df['Reviews'].str.replace(r'\b' + abbrev + r'\b', full_form, case=False, regex=True)

unwanted_words = [
    'euuuuuuuuuuuuuu', 'imei', 'xx', 'o', 'p', 'k', 'x', 'w', 'st', 'isbn', 'gb', 'min', 'mah', 'x x x', 'makhan',
    'bim', 'rd', 'r', 'n', 'nd', 'mm', 'fish', 'apple ka bar', 'mp', 'ppi', 'yrr', 'imei', 'k', 'science', 'x', 'nd',
    'rd', 'le', 'st', 'p', 'pm', 'f', 'tho', 'h', 'th', 'gn', 'xr', 'xl', 'am', 'tg', 'p', 'z', 'ce', 'ip', 'lt', 'td',
    'gt', 'lea', 'la', 'der', 'ir', 'j', 'sp', 'th', 'v', 'cg', 'wee', 'seg', 'g', 'pro', 'x', 'p', 'e', 'opt', 'nfc',
    'tm', 'ir', 'xnd', 'ffd', 'htc', 'cal', 'pm', 'ordo', 'gh', 'rn', 'mia', 'ip', 'sd', 'oct', 'core', 'expos', 'rd',
    'ip', 'pe', 'amp', 'paisa bassol', 'pc', 'al', 'zindabad', 'dnd', 'rsk', 'g', 'ooooooo', 'mmmmmmm', 'ggggggg', 'vfd',
    'havoc', 'ah', 'hi', 'hai', 'swimming', 'gop', 'ji', 'lajawab', 'asap loose', 'ie', 'h ago', 'madathukulam', 'mo', 
    'fea', 'l', 'ex', 'qu', 'Lotus', 'voodoo', 'ko', 'ont', 'maa', 'ag', 'ne', 'quot', 'sakuntala', 'rayagada bissamcuttack',
    'gimmick', 'thi', 'str', 'fatafati', 'mast', 'garcia jharkhand', 'sa', 'isvudn', 'x', 'sg', 'hijab', 'tbh', 'wars', 
    'nts', 'huawei', 'keirin', 'ofc', 'pg', 'sec', 'jaar', 'hai ye', 'th', 'ft', 'emu', 'gpu', 'fr', 'nit', 'eis', 'oo',
    'hh', 'ly', 'fei', 'q', 'c', 'sw', 'kd', 'ofc', 'bl', 'lol', 'et', 'rgh', 'extremewwe', 'dts', 'iu', 'vr', 'Soo', 
    'al', 'sm', 'rsk', 'eta', 'gh', 'nu', 'ota', 'fp'
]
# Remove unwanted words
for word in unwanted_words:
    df['Reviews'] = df['Reviews'].str.replace(r'\b' + word + r'\b', '', case=False, regex=True)

# Save the updated DataFrame to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\motorola_reviews_final.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaned and saved successfully.")

    


Data cleaned and saved successfully.


In [None]:
'moto':'motorola',lg:life good, cheerzzz:cheers,plz:please,mei:me,worthy:worth,gente:generation,amd:and,lte:light,mid:middle,
awestruck:awesome struck.abt:about,tat:that,dis:this,,gente:generation,v:we,r:are,exppos:exynos,ill:i will,ad:advertisement.realm:realme,
sd:snapdragon,bickering:flickering,ex:example,lcd:liquid crystal display,fyi:for your information,hd:high definition,mio:mimo, even tho:even though,
wowwwwww:wow,redmipocohonor:redmi poco honor,epxiernce:experience,opt:option,jan:january,oct:october,phonemes:phone,portthats:port thats,
battery profane :battery performance,ram:random access memory,rom:read only memory,hasn:has not,spec:specification,nsa:non standalone,
sa:signal antenna,god:good,camarabut:camera but,awsmrest:awesome rest,surfing:suffering,androidmoto:android motorola,
topnotchperformance:top notch performance,tommoro:tomorrow,his friend :after few,avg:average,

In [None]:
'x','p','e','opt','nfc','tm','ir','xnd','ffd','htc','cal','pm','ordo','gh','rn','mia','ip','sd','oct','core','expos','rd','ip','pe','amp','paisa bassol','pc','al','zindabad','dnd','rsk','g','ooooooo',
'mmmmmmm','ggggggg','vfd','havoc','ah','hi','hai','swimming','gop','ji','lajawab','asap loose','ie','h ago'

In [None]:

    'moto': 'motorola', 'lg': 'life good', 'cheerzzz': 'cheers', 'plz': 'please', 'mei': 'me', 'worthy': 'worth', 'gente': 'generation',
    'amd': 'and','lte': 'light','mid': 'middle','awestruck': 'awesome struck','abt': 'about','tat': 'that','dis': 'this','v': 'we',
    'r': 'are','exppos': 'exynos','ill': 'i will','ad': 'advertisement','realm': 'realme','sd': 'snapdragon','bickering': 'flickering',
    'ex': 'example','lcd': 'liquid crystal display','fyi': 'for your information','hd': 'high definition','mio': 'mimo',
    'even tho': 'even though', 'wowwwwww': 'wow', 'redmipocohonor': 'redmi poco honor', 'epxiernce': 'experience', 'opt': 'option',
    'jan': 'january','oct': 'october','phonemes': 'phone','portthats': 'port thats','battery profane': 'battery performance',
    'ram': 'random access memory','rom': 'read only memory','hasn': 'has not','spec': 'specification','nsa': 'non standalone',
    'sa': 'signal antenna', 'god': 'good','camarabut': 'camera but','awsmrest': 'awesome rest','surfing': 'suffering',
    'androidmoto': 'android motorola','topnotchperformance': 'top notch performance','tommoro': 'tomorrow',
    'his friend': 'after few','avg': 'average','knot': 'okay not', 'mod': 'mode', 'pho': 'phone', 'phine': 'phone', 'west': 'waste', 'dayyyyyyyy': 'day', 'parfomes':'performance', 'imp': 'important',
           'swimmer': 'shimmery', 'gam': 'game', 'goodcamra': 'good camera', 'spec': 'specification', 'performancedont': 'performance do not', 'mobileoppof': 'mobile oppo',
           'colitis': 'quality', 'priformence': 'performence', 'ny': 'nice', 'goodmobail': 'good mobile', 'product…': 'product', 'performancegoo': 'performance',
           'superrrrr': 'super', 'coma': 'come', 'ju': 'just', 'gotoofficewithstyle': 'go to office with style', 'supervooc': 'super', 'feb': 'february', 'pais': 'money',
           'lightning': 'lightening', 'lestvice': 'quality best', 'fischer': 'feature', 'slick': 'slim'



In [None]:
{'knot': 'okay not', 'mod': 'mode', 'pho': 'phone', 'phine': 'phone', 'west': 'waste', 'dayyyyyyyy': 'day', 'parfomes':'performance', 'imp': 'important',
           'swimmer': 'shimmery', 'gam': 'game', 'goodcamra': 'good camera', 'spec': 'specification', 'performancedont': 'performance do not', 'mobileoppof': 'mobile oppo',
           'colitis': 'quality', 'priformence': 'performence', 'ny': 'nice', 'goodmobail': 'good mobile', 'product…': 'product', 'performancegoo': 'performance',
           'superrrrr': 'super', 'coma': 'come', 'ju': 'just', 'gotoofficewithstyle': 'go to office with style', 'supervooc': 'super', 'feb': 'february', 'pais': 'money',
           'lightning': 'lightening', 'lestvice': 'quality best', 'fischer': 'feature', 'slick': 'slim']
           
remove_words = ['madathukulam', 'mo', 'fea', 'l', 'ex', 'qu', 'Lotus', 'voodoo', 'ko', 'ont', 'maa', 'ag', 'ne', 'quot','sakuntala', 'rayagada bissamcuttack',
               'gimmick','thi', 'str', 'fatafati', 'mast', 'garcia jharkhand', 'sa', 'isvudn']

In [5]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import emoji

# Load the data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\google_merged.csv"
df = pd.read_csv(file_path)

# Handle missing values
# Drop rows where the Reviews text is missing
df.dropna(subset=['Review'], inplace=True)

# Ensure all entries in the Reviews column are strings
df['Review'] = df['Review'].astype(str)

# Remove emojis using the emoji library
def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

# Remove special characters and digits
def remove_special_characters(text):
    return re.sub('[^a-zA-Z\s]', '', text)

# Normalize the text
def normalize_text(text):
    text = text.lower()  # Convert to lowercase
    text = remove_emojis(text)
    text = remove_special_characters(text)
    text = re.sub('\s+', ' ', text)  # Remove extra spaces
    return text

# Apply normalization
df['Cleaned_Review'] = df['Review'].apply(normalize_text)

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['Cleaned_Review'] = df['Cleaned_Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Apply lemmatization
lemmatizer = WordNetLemmatizer()
df['Cleaned_Review'] = df['Cleaned_Review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# Save the cleaned data to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\google_reviews_cleaned.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaning completed and saved to", output_file_path)


Data cleaning completed and saved to C:\Users\Elakkiya\Downloads\flipkart\google_reviews_cleaned.csv


In [6]:
import pandas as pd
from autocorrect import Speller

# Load the data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\google_reviews_cleaned.csv"
df = pd.read_csv(file_path)# Apply autocorrect using autocorrect library
spell = Speller(lang='en')

def correct_spelling(text):
    return ' '.join([spell(word) for word in text.split()])

df['Autocorrected_Review'] = df['Cleaned_Review'].apply(correct_spelling)

# Save the cleaned data to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\google_reviews_autocorrected.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaning and spell correction completed and saved to", output_file_path)

Data cleaning and spell correction completed and saved to C:\Users\Elakkiya\Downloads\flipkart\google_reviews_autocorrected.csv


In [7]:
import pandas as pd
import re

# Load the cleaned data
file_path = r"C:\Users\Elakkiya\Downloads\flipkart\google_reviews_autocorrected.csv"
df = pd.read_csv(file_path)

# Create a new column 'Reviews' based on 'Autocorrected_Reviews'
df['Reviews'] = df['Autocorrected_Review']

# Dictionary of specific abbreviations and their corrections
abbreviation_dict = {
    'camara': 'camera', 'fps': 'frames per second', 'oplus': 'oneplus', 'sd gen': 'snapdragon generation', 'superb': 'super',
    'professor': 'processor', 'sound college': 'sound quality', 'dam': 'damn', 'opp': 'oppo', 'aws': 'awesome',
    'vry gd': 'very good', 'supppoppp': 'super', 'prosesar': 'processor', 'approx hr sot': 'approximately hour screen on time',
    'u': 'you', 'hr': 'hour', 'prosper': 'processor', 'usa': 'usage', 'good pro': 'good product', 'gd': 'good',
    'nice prod': 'nice product', 'p': 'performance', 'ui': 'user interface', 'day us': 'day usage', 'prosesar': 'processor',
    'flipcart': 'flipkart', 'battery beast': 'battery best', 'ease': 'easy', 'pub': 'pubg', 'osm': 'awesome',
    'worth karma worthuuu': 'worth the money', 'ois': 'optical image stabilization', 'aim happy': 'i am happy',
    'pub cod': 'pubg, call of duty', 'hz': 'hertz', 'degree census': 'degree celsius', 'nd': 'and',
    'hit issue': 'heating issue', 'ai': 'artificial intelligence', 'wil': 'will', 'tax': 'thanks', 'anthem': 'item',
    'mint camera': 'main camera', 'heat throttle': 'heating and throttling', 'ill update': 'i will update',
    'phone devils aswoome': 'phone device is awesome', 'hdr': 'high dynamic range', 'fo': 'for',
    'op ti really': 'oneplus really', 'one plus ph': 'oneplus phone', 'spr': 'super', 'apprehension': 'appreciation',
    'max': 'maximum', 'oxygen o': 'oxygen os', 'kinda': 'kind of', 'medio': 'mediocre', 'btw': 'by the way', 'assam': 'awesome',
    'ph': 'phone', 'bos': 'range boost', 'beast': 'best', 'dlr': 'dslr', 'batter': 'better', 'cod': 'call of duty', 'nyc': 'nice',
    'unvilevebale item': 'unbelievable item', 'supper': 'super', 'op': 'oneplus', 'assume': 'awesome',
    'osm dolly atoms': 'awesome dolby atmos', 'mic failed': 'microphone failed', 'premium paper': 'premium feel', 'ver': 'very',
    'gonna': 'going to', 'pub': 'pubg', 'gun': 'good', 'jus': 'just', 'ive': 'i have', 'extent': 'excellent', 'nice hoon': 'nice phone',
    'gr': 'great', 'math': 'match', 'bt': 'but', 'ok':'okay', 'costa': 'costar', 'beast': 'best', 'hr': 'hour', 'sha': 'should',
    'wk': 'week', "hadnt": "had not", "im": "i am", "havent": "have not", "hasnt": "has not", 'u': "you", 'r': "are",
    "ui": "user interface", "doesnt": "does not", 'rambling': "rumbling", "io": "iphone operating system", 
    "sot": "special operation team", "le": "less", "fhd": "full high definition", "dont": "do not",
    "cam": "camera", "came": "camera", "avg": "average", "yea": "yeah", "lil": "little", 
    "costlier": "more expensive", "it's": "it is", "very": "very", "n": "and", "cant": "cannot", "dis": "this", "v": "we", 
    "hdr": 'high dynamic range', "didnt": "did not", "ive": "i have", "bezel": "bezel", "ur": "your", "wont": 'will not', 
    "hd": 'high definition', 'cleanui': "clean user interface", "tatics": "haptics", "sd": "secure digital", "gen": "generation", 
    "usp": 'unique selling proposition', 'degc': "degree celsius", "tatic": "haptic", "unbuilt": "inbuilt", 'xiomi': "xiaomi",
    'regreating': "regretting", 'fyi': "for your information", 'issuehrs': "issue hours", 'doomed': "zoomed",
    'ois': "optical image stabilization", 'theyll': "they will", 'ig': "instagram", 'bbd': "bigger better deal", 'cemra': "camera", 
    'fastly': "fast", "optimise": 'optimize', 'osum': 'awesome', 'vi': "vodafone", 'upi': "unified payments interface",
    'eyeturner': "eye turner", 'banger': "banger", 're': "resolution", 'goddamn': "goddamn", 'aint': "am not", 'plesently': "pleasantly", 
    'thik': "think", "tooo": "too", "uisvery": 'user interface very', 'gif': "graphics interchange format", 'siz': "six", 
    'costlier': "costlier", 'iphones': "iphone", 'youre': 'you are', 'doubtbut': "doubt but", 'phome': "phone", 'red': "redmi", 
    'okif': "okay if", 'pic': "picture", 'smatter': 'smarter', 'membrane': "ambrane", "holdnew": 'hold new', 'swine': "swipe", 
    'inshot': "inshot", 'pixelated': "pixel related", 'cameraai': "camera artificial intelligence", 'oppos': 'oppo', 'amaze': "amazing", 
    'daytoday': "day to day", 'offmy': "off my", 'laggy': "lag", 'victus': "victus", 'slowlike': "slow like",
    'wholeday': "full day", 'commendable': "recommendable", 'baku': "vaku", 
    'beat': "best", 'surfed': "suffered", 'bgmi': "battle ground mobile india", 'isnt': "is not", 
    'flickering': "flickering", 'least': "at least", 'doun': "down", 'thats': "that is", 'ill': "i will", 'bout': "about", 
    'butter': "better", 'emi': "equated monthly installment", 'prefect': "perfect", 'ketone': "keyone", 
    'ie': "in other words", 'cuz': "because", "youve": "you have", 'ott': "over the top", 
    'oct': "october", 'ois': "optical image stabilization", 'ip': "iphone", 'nowhope': "now hope", 'red': "redmi",
    "eraserunblur": "eraser focus", "wifi": "wireless fidelity", 'ok': "okay", 'hiccup': "hiccup",
    'janso': "january so", 'mahmaybe': "milliampere hour may be", 'lovable': "lovable", 'etc': "et cetera", "karma": "varma",
    'dayyyyyyyy': "day", 'pic': "picture", 'camra': "camera", 'Vry': "very", 'easilllyy': "easily",
    'came': "camera", 'mob': "mobile", 'flipcard': "flipkart", 'Kkk': "okay", 'swimmer': 'shimmery', 'supeeeeer': 'super', 'ur': 'your', 'nais': 'nice', 'descent': 'decent', 'se user': 'super', 'nris': 'nice',
    'suuuuuuuuuuuuuuper': 'super', 'project': 'product', 'xtra': 'extra', 'userniterface': 'user interface', 'prossecer': 'processor', 'amazingest': 'amazing',
    'col': 'color', 'superbbbbb': 'superb', 'goog': 'good', 'censor': 'sensor', 'ram': 'ram', 'wwwhhhoooooooo': 'who', 'pictre': 'picture', 'comera': 'camera', 'fa': 'for',
    'truely': 'truly', 'fonnne': 'phone', 'excellant': 'excellent', 'totallly': 'totally', 'ceramicback': 'ceramic back', 'internat': 'internet', 'iui': 'miui', 'camaera': 'camera', 'supper': 'super',
    'exllent': 'excellent', 'magnafiqui': 'magnificent', 'affecfordble': 'affordable', 'smmmmooooothhhhhh': 'smooth', 'veryg': 'very good', 'camaera': 'camera',
    'ar': 'are', 'eet': 'get', 'june': 'june', 'ster': 'star', 'osam': 'awesome', 'gona': 'gonna', 'suchh': 'such', 'amoled': 'amoled', 'fon': 'phone', 'ram': 'ram', 'exp': 'expensive',
    'xclent': 'excellent', 'ans': 'and', 'shotcut': 'shortcut', 'expoerience': 'experience', 'bgmi': 'battle grounds mobile india', 'anr': 'and', 'whao': 'wow', 'awesomeeee': 'awesome', 'blurr': 'blur',
    'definately': 'definitely', 'suuuuper': 'super', 'nize': 'nice', 'phobe': 'phone', 'thanksflipcart': 'thanks flipkart', 'mimax': 'mi max', 'picure': 'picture', 'blutooth': 'bluetooth', 'qhy': 'why',
    'user interfce': 'user interface', 'beasutiful': 'beautiful', 'smother': 'smoother', 'gooddd': 'good', 'ramgood': 'ram good', 'smother': 'smoother', 'suggestable': 'suggestible', 'gamesro': 'games to',
    'playbles': 'playable', 'fes': 'fest', 'crystalclear': 'crystal clear', 'ph': 'phone', 'frimeware': 'firmware', 'dowload': 'download', 'camara': 'camera', 'gud': 'good', 'jio': 'jio', 'againflipkart': 'again flipkart',
    'imposible': 'impossible', 'comfortbale': 'comfortable', 'whith': 'with', 'chages': 'changes', 'consiser': 'consider', 'heatsup': 'heats up', 'grt': 'great', 'fringerprint': 'fingerprint', 'detailingg': 'detailing',
    'gud': 'good', 'hz': 'hertz', 'nd': 'and', 'soundcollage': 'sound quality', 'wi': 'with', 'appreciateble': 'appreciable', 'phn': 'phone', 'mu': 'music', 'preforance': 'performance', 'veryyy': 'very',
    'ceramicback': 'ceramic back', 'ceraicback': 'ceramic back', 'battrey': 'battery', 'hollywow': 'hollywood', 'expirence': 'experience', 'niceui': 'nice user interface', 'terriffic': 'terrific', 'varien': 'variant',
    'xcellent': 'excellent', 'sorring': 'scoring', 'comfrtable': 'comfortable', 'mics': 'microphones', 'cds': 'cd', 'whot': 'what', 'mindbloing': 'mind blowing', 'gamesome': 'games on', 'fluently': 'fluent',
    'dat': 'that', 'amasing': 'amazing', 'superrr': 'super', 'osm': 'awesome', 'hreat': 'great', 'costefficeint': 'cost efficient', 'bhaskar': 'bhaskaran', 'heatingproblem': 'heating problem',
    'miracast': 'miracast', 'definately': 'definitely', 'wo': 'would', 'thsnks': 'thanks', 'whywhy': 'why why', 'cam': 'camera', 'gaurantee': 'guarantee', 'crazychaging': 'crazy charging',
    'interfece': 'interface', 'excellant': 'excellent', 'intenation': 'internet', 'bettry': 'battery', 'exellent': 'excellent', 'lockscreen': 'lock screen', 'tap': 'tap', 'eb': 'is',
    'cam': 'camera', 'exellent': 'excellent', 'pros': 'pros', 'cus': 'because', 'definatly': 'definitely', 'microphoneon': 'microphone on', 'renaining': 'remaining', 'nxet': 'next',
    'switcher': 'switcher', 'yooo': 'yo', 'exlent': 'excellent', 'vol': 'volume', 'magic': 'magic', 'beacuse': 'because', 'ven': 'even', 'expence': 'expense', 'pones': 'phones',
    'pu': 'pubg', 'exlent': 'excellent', 'bgmi': 'battle ground mobile india', 'athir': 'arshad', 'ramaining': 'remaining', 'docing': 'docking', 'batterylife': 'battery life', 'batt': 'battery', 
    'cas': 'case', 'proble': 'problem', 'sa': 'sa', 'lense': 'lens', 'cameraof': 'camera of', 'parformance': 'performance', 'whatsapp': 'whatsapp', 'camra': 'camera', 'amoleddisplay': 'amoled display',
    'hugh': 'high', 'exellent': 'excellent', 'phon': 'phone', 'boaring': 'boring', 'sync': 'sync', 'flipkart': 'flipkart', 'tp': 'to', 'displayflipkart': 'display flipkart', 'flipcart': 'flipkart',
    'complain': 'complain', 'sdgen': 'snapdragon generation', 'comerabgmi': 'camera battle ground mobile india', 'doint': 'do not', 'iys': 'it is', 'superp': 'super', 'expeeience': 'experience', 'loook': 'look',
    'couple': 'couple', 'phoneflipkart': 'phone flipkart', 'got': 'got', 'consisency': 'consistency', 'dispkay': 'display', 'proc': 'processor', 'gud': 'good', 'clarityclearity': 'clarity', 'octo': 'octa',
    'excessive': 'excessive', 'goodem': 'good em', 'snapdragon': 'snapdragon', 'camara': 'camera', 'sper': 'super', 'rebooting': 'rebooting', 'accessorry': 'accessory', 'usr': 'user', 'dispkay': 'display',
    'pictureclarity': 'picture clarity', 'dispkayclarity': 'display clarity', 'extremely': 'extremely', 'awsome': 'awesome', 'goodcamera': 'good camera', 'goodclatrity': 'good clarity', 'miui': 'miui',
    'clearn': 'clear', 'cllick': 'click', 'reseting': 'resetting', 'psl': 'psl', 'edxtra': 'extra', 'redmy': 'redmi', 'gamy': 'game', 'biometrics': 'biometrics', 'noney': 'money', 'lenovo': 'lenovo',
    'possessing': 'processing', 'ossam': 'awesome', 'tis': 'this', 'surport': 'support', 'awsom': 'awesome', 'supere': 'super', 'serviceflipkart': 'service flipkart', 'awesomeas': 'awesome', 'greate': 'great',
    'mobs': 'mobile', 'sept': 'september', 'its': 'its', 'wasawsm': 'was awesome', 'flipkartgood': 'flipkart good', 'easeofuse': 'ease of use', 'mi': 'mi', 'perfectperfect': 'perfect', 'ohow': 'oh how',
    'dosn': 'does', 'okkay': 'okay', 'awdome': 'awesome', 'des': 'des', 'awesomefeature': 'awesome feature', 'circa': 'circa', 'gud': 'good', 'backca': 'back cover', 'yhe': 'the', 'niceproduct': 'nice product',
    'me':'mi', 'awesomegod': 'awesome', 'tricking': 'tricking', 'allure': 'allure', 'mia1': 'mi a1', 'backpanel': 'back panel', 'microwave': 'microphone', 'fri': 'friend', 'frusterating': 'frustrating','moto':'motorola',
    'rs': 'rs', 'chat': 'chat', 'saiz': 'size', 'ram': 'ram', 'workswll': 'works well', 'mid': 'mi', 'ph': 'phone', 'flpkrt': 'flipkart', 'blowming': 'blowing', 'complaince': 'complain', 'ar': 'are', 'phn': 'phone',
    'usb': 'usb', 'coverback': 'cover back', 'awesomecamera': 'awesome camera', 'osame': 'awesome', 'bat': 'battery', 'weighing': 'weighing', 'midrange': 'mid range', 'blowng': 'blowing', 'yoursite': 'your site',
    'superbg': 'superb', 'intetnet': 'internet', 'besutiful': 'beautiful', 'suuuper': 'super', 'ate': 'ate', 'ceramicback': 'ceramic back', 'velvet': 'velvet', 'goog': 'good', 'doeswork': 'does work', 'interrface': 'interface',
    'spellings': 'spellings', 'competetion': 'competition', 'clarityis': 'clarity is', 'xclent': 'excellent', 'crome': 'chrome', 'sinply': 'simply', 'cermaic': 'ceramic', 'plz': 'please', 'compability': 'compatibility',
    'excellnt': 'excellent', 'iui': 'miui', 'ultraa': 'ultra', 'whatsoever': 'whatsoever', 'goodand': 'good and', 'gotcamera': 'got camera', 'ple': 'please', 'afortable': 'affordable', 'kns': 'kms', 'nicefeature': 'nice feature',
    'redemi': 'redmi', 'xxent': 'excellent', 'cuttrt': 'cutout', 'mi': 'mi', 'serires': 'series', 'whish': 'wish', 'dlexo': 'deluxe', 'cristal': 'crystal', 'makesense': 'make sense', 'gud': 'good', 'expellience': 'experience',
    'reflectiion': 'reflection', 'microphone': 'microphone', 'cameraquality': 'camera quality', 'awesomef': 'awesome', 'guddisplay': 'good display', 'frustrting': 'frustrating', 'vol': 'volume', 'cermaic': 'ceramic', 'clearence': 'clearance', 
    'snapdragon': 'snapdragon', 'ceramicback': 'ceramic back', 'cermaicback': 'ceramic back', 'snapdragon820': 'snapdragon 820', 'miui8': 'miui 8', 'snapdragon': 'snapdragon', 'snapdragon': 'snapdragon', 'snapdragon835': 'snapdragon 835', 'snapdragon820': 'snapdragon 820'
}
# Replace abbreviations in the Reviews column
for abbrev, full_form in abbreviation_dict.items():
    # Use word boundaries to ensure whole words are replaced
    df['Reviews'] = df['Reviews'].str.replace(r'\b' + abbrev + r'\b', full_form, case=False, regex=True)

unwanted_words = [
    'euuuuuuuuuuuuuu', 'imei', 'xx', 'o', 'p', 'k', 'x', 'w', 'st', 'isbn', 'gb', 'min', 'mah', 'x x x', 'makhan',
    'bim', 'rd', 'r', 'n', 'nd', 'mm', 'fish', 'apple ka bar', 'mp', 'ppi', 'yrr', 'imei', 'k', 'science', 'x', 'nd',
    'rd', 'le', 'st', 'p', 'pm', 'f', 'tho', 'h', 'th', 'gn', 'xr', 'xl', 'am', 'tg', 'p', 'z', 'ce', 'ip', 'lt', 'td',
    'gt', 'lea', 'la', 'der', 'ir', 'j', 'sp', 'th', 'v', 'cg', 'wee', 'seg', 'g', 'pro', 'x', 'p', 'e', 'opt', 'nfc',
    'tm', 'ir', 'xnd', 'ffd', 'htc', 'cal', 'pm', 'ordo', 'gh', 'rn', 'mia', 'ip', 'sd', 'oct', 'core', 'expos', 'rd',
    'ip', 'pe', 'amp', 'paisa bassol', 'pc', 'al', 'zindabad', 'dnd', 'rsk', 'g', 'ooooooo', 'mmmmmmm', 'ggggggg', 'vfd',
    'havoc', 'ah', 'hi', 'hai', 'swimming', 'gop', 'ji', 'lajawab', 'asap loose', 'ie', 'h ago', 'madathukulam', 'mo', 
    'fea', 'l', 'ex', 'qu', 'Lotus', 'voodoo', 'ko', 'ont', 'maa', 'ag', 'ne', 'quot', 'sakuntala', 'rayagada bissamcuttack',
    'gimmick', 'thi', 'str', 'fatafati', 'mast', 'garcia jharkhand', 'sa', 'isvudn', 'x', 'sg', 'hijab', 'tbh', 'wars', 
    'nts', 'huawei', 'keirin', 'ofc', 'pg', 'sec', 'jaar', 'hai ye', 'th', 'ft', 'emu', 'gpu', 'fr', 'nit', 'eis', 'oo',
    'hh', 'ly', 'fei', 'q', 'c', 'sw', 'kd', 'ofc', 'bl', 'lol', 'et', 'rgh', 'extremewwe', 'dts', 'iu', 'vr', 'Soo', 
    'al', 'sm', 'rsk', 'eta', 'gh', 'nu', 'ota', 'fp'
]
# Remove unwanted words
for word in unwanted_words:
    df['Reviews'] = df['Reviews'].str.replace(r'\b' + word + r'\b', '', case=False, regex=True)

# Save the updated DataFrame to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\google_reviews_final.csv"
df.to_csv(output_file_path, index=False)

print("Data cleaned and saved successfully.")

    


Data cleaned and saved successfully.


In [13]:
import pandas as pd
import glob

# List of file paths to combine
file_paths = [
    r"C:\Users\Elakkiya\Downloads\flipkart\motorola_reviews_final.csv",
    r"C:\Users\Elakkiya\Downloads\flipkart\oneplus_reviews_final.csv",
    r"C:\Users\Elakkiya\Downloads\flipkart\Honor_reviews_final.csv",
    r"C:\Users\Elakkiya\Downloads\flipkart\google_reviews_final.csv",
    r"C:\Users\Elakkiya\Downloads\flipkart\oppo_reviews_final.csv",
]

# Initialize an empty list to hold DataFrames
dfs = []

# Read each file and append the DataFrame to the list
for file_path in file_paths:
    df = pd.read_csv(file_path)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Drop rows with missing values
combined_df.dropna(inplace=True)

# Drop specified columns
columns_to_drop = ['Autocorrected_Review', 'Product_Link', 'Product_Price', 'Cleaned_Review', 'Review']
combined_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Rename the specified columns
columns_to_rename = {'Rating': 'Ratings'}
combined_df.rename(columns=columns_to_rename, inplace=True)

# Count the number of duplicate rows
duplicate_count = combined_df.duplicated(keep=False).sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Drop duplicate rows
combined_df.drop_duplicates(inplace=True)

# Save the combined DataFrame to a new CSV file
output_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\Flipkart_Products_reviews.csv"
combined_df.to_csv(output_file_path, index=False)

print("Combined DataFrame created successfully and saved.")


Number of duplicate rows: 1005
Combined DataFrame created successfully and saved.


In [25]:
import pandas as pd
df=pd.read_csv(r"C:\Users\Elakkiya\Downloads\flipkart\Flipkart_Products_reviews.csv")
df

Unnamed: 0,Product_Name,Ratings,Reviews
0,"MOTOROLA Edge 50 (Peach Fuzz, 256 GB)",4.0,crisp display camera performed way better exp...
1,"MOTOROLA Edge 50 (Peach Fuzz, 256 GB)",4.0,one day review awesome design awesome camera ...
2,"MOTOROLA Edge 50 (Peach Fuzz, 256 GB)",5.0,could recommend best price range clean user in...
3,"MOTOROLA Edge 50 (Peach Fuzz, 256 GB)",4.0,camera best mobile phone heating much
4,"MOTOROLA Edge 50 (Peach Fuzz, 256 GB)",5.0,phone come ram boost get turning get hand make...
...,...,...,...
9520,"OPPO Reno11 Pro 5G (Rock Grey, 256 GB)",5.0,good phone love
9521,"OPPO Reno11 Pro 5G (Rock Grey, 256 GB)",5.0,great look
9522,"OPPO Reno11 Pro 5G (Rock Grey, 256 GB)",1.0,front camera good lost money
9523,"OPPO Reno11 Pro 5G (Rock Grey, 256 GB)",5.0,super


In [16]:
df['Product_ID'] = pd.factorize(df['Product_Name'])[0] + 1

In [4]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Step 1: Download VADER lexicon (only needed once)
# nltk.download('vader_lexicon')

# Step 2: Load your DataFrame from the CSV file
input_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\Flipkart_Products_reviews.csv"
df = pd.read_csv(input_file_path)

# Creating a Product_ID column by factorizing Product_Name
df['Product_ID'] = pd.factorize(df['Product_Name'])[0] + 1

# Step 3: Initialize VADER Sentiment Analyzer
sia = SentimentIntensityAnalyzer()

# Step 4: Define a function to get sentiment scores
def get_sentiment(review):
    sentiment = sia.polarity_scores(review)
    return sentiment['compound']  # Returning the compound score

# Step 5: Apply the function to the Review column
df['Sentiment_Score'] = df['Reviews'].apply(get_sentiment)

# Step 6: Classify sentiment as Positive, Negative, or Neutral
df['Sentiment'] = df['Sentiment_Score'].apply(
    lambda score: 'Positive' if score > 0.05 else ('Negative' if score < -0.05 else 'Neutral')
)

# Step 7: Save the DataFrame to a CSV file
output_file_path = "Flipkart_Products_reviews_with_sentiment.csv"
df.to_csv(output_file_path, index=False)

print(f'Sentiment analysis results saved to {output_file_path}')


Sentiment analysis results saved to Flipkart_Products_reviews_with_sentiment.csv


In [2]:
df=pd.read_csv(r"C:\Users\Elakkiya\Downloads\flipkart\Flipkart_Products_reviews_with_sentiment.csv")


In [31]:
df[df['Sentiment'] == 'Negative']

Unnamed: 0,Product_Name,Ratings,Reviews,Sentiment_Score,Sentiment
26,"MOTOROLA Edge 50 (Peach Fuzz, 256 GB)",3.0,photo quality good camera app lag slow irritating battery average does not last long enough fast charging also mark phone tremendous heating issue get heated charging video recording video calling taking picture also running app display phone really amazing disappointed,-0.1987,Negative
31,"MOTOROLA Edge 50 (Peach Fuzz, 256 GB)",3.0,battery drainage quickly and heating problem,-0.4019,Negative
72,"MOTOROLA Edge 50 (Koala Grey, 256 GB)",3.0,photo quality good camera app lag slow irritating battery average does not last long enough fast charging also mark phone tremendous heating issue get heated charging video recording video calling taking picture also running app display phone really amazing disappointed,-0.1987,Negative
77,"MOTOROLA Edge 50 (Koala Grey, 256 GB)",3.0,battery drainage quickly and heating problem,-0.4019,Negative
332,"Motorola Edge 50 Fusion (Forest Green, 256 GB)",5.0,first time us motorola smartphones go without hesitation every thing mark price segment ignore negative comment,-0.6582,Negative
338,"Motorola Edge 50 Fusion (Forest Green, 128 GB)",5.0,received defective product speaker working flipkart denied return gave motorola service centre responding suggested pls pls buy electronic item flipkart,-0.6369,Negative
412,"MOTOROLA Edge 50 (Jungle Green, 256 GB)",3.0,photo quality good camera app lag slow irritating battery average does not last long enough fast charging also mark phone tremendous heating issue get heated charging video recording video calling taking picture also running app display phone really amazing disappointed,-0.1987,Negative
417,"MOTOROLA Edge 50 (Jungle Green, 256 GB)",3.0,battery drainage quickly and heating problem,-0.4019,Negative
446,"Motorola Edge 50 Pro 5G with 125W Charger (Caneel Bay, 256 GB)",5.0,phone good flipkart service worse,-0.0516,Negative
496,"Motorola Edge 50 Pro 5G with 125W Charger (Vanilla Cream, 256 GB)",5.0,phone good flipkart service worse,-0.0516,Negative


In [30]:
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [5]:
import pandas as pd

# Step 1: Load your DataFrame from the CSV file
input_file_path = r"C:\Users\Elakkiya\Downloads\flipkart\Flipkart_Products_reviews_with_sentiment.csv"
df = pd.read_csv(input_file_path)

# Step 2: Aggregate average sentiment scores by Product_Name
product_recommendations = df.groupby('Product_Name').agg({
    'Product_ID': 'first',           # Get the first Product_ID for each product
    'Sentiment': 'first',            # Get the first Sentiment for each product
    'Sentiment_Score': 'mean',       # Average sentiment score
    'Ratings': 'mean',               # Average rating
    'Reviews': 'count'               # Count of reviews
}).reset_index()

# Step 3: Sort the products by average sentiment score
product_recommendations = product_recommendations.sort_values(by='Sentiment_Score', ascending=False)

# Step 4: Save the resulting DataFrame to a CSV file
output_path = r"C:\Users\Elakkiya\Downloads\flipkart\Flipkart_mobile_recommendation.csv"
product_recommendations.to_csv(output_path, index=False)

print(f"Product recommendations saved to {output_path}")


Product recommendations saved to C:\Users\Elakkiya\Downloads\flipkart\Flipkart_mobile_recommendation.csv


In [32]:
df=pd.read_csv(r"C:\Users\Elakkiya\Downloads\flipkart\Flipkart_mobile_recommendation.csv")
df

Unnamed: 0,Product_Name,Sentiment_Score,Ratings,Reviews
0,"Honor 7 (Mystery Grey, 16 GB)",0.98724,4.6,10
1,"Honor 7 (Fantasy Silver, 16 GB)",0.98724,4.6,10
2,"MOTOROLA Moto X (2nd Generation) (Black Leather, 16 GB)",0.9226,4.38,50
3,"Honor 10 (Midnight Black, 128 GB)",0.862016,4.8,100
4,"Honor 10 (Phantom Blue, 128 GB)",0.862016,4.8,100
5,"Honor 8 Pro (Navy Blue, 128 GB)",0.841794,4.75,100
6,"Honor 8 Pro (Midnight Black, 128 GB)",0.841794,4.75,100
7,"Motorola Edge 20 5G (Frosted Emerald, 128 GB)",0.822806,4.68,50
8,"Motorola Edge 20 5G (Frosted Onyx, 128 GB)",0.822806,4.68,50
9,"Motorola Edge 20 5G (Frosted Pearl, 128 GB)",0.822806,4.68,50
