In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use(style = 'seaborn')
from sklearn.model_selection import train_test_split

In [2]:
txt = 'Use of python in Machine Learning'
x = re.findall('in', txt)
print('1: ', x)


1:  ['in', 'in', 'in']


In [3]:
txt = 'Python is one of the most popular languages around the world'
searchObj = re.search('\s', txt)
print('2: The first white space character is located in the position: ', searchObj.start())

2: The first white space character is located in the position:  6


In [5]:
string = "Python is one of the most popular languages around the world"
searchObj=re.split("\s",string)
print("3: ",searchObj)

3:  ['Python', 'is', 'one', 'of', 'the', 'most', 'popular', 'languages', 'around', 'the', 'world']


In [6]:
string = 'Python is one of the most popular languages around the world'
searchObj = re.sub('\s', '_', string)
print('4: ', searchObj)

4:  Python_is_one_of_the_most_popular_languages_around_the_world


In [7]:
string = 'Python is one of the most popular languages around the world'
searchObj = re.search(r'\bP\w+', string)
print('5: ', searchObj)

5:  <re.Match object; span=(0, 6), match='Python'>


In [8]:
txt = 'The rain in Spain'
x = re.findall('[a-m]', txt)
print('6: ', x)


6:  ['h', 'e', 'a', 'i', 'i', 'a', 'i']


In [9]:
txt = 'hello world'
x = re.findall('he.*o', txt)
print('7: ', x)

7:  ['hello wo']


In [10]:
def editDistance(str1, str2, m, n):
  if m == 0:
    return n
  if n == 0:
    return m
  if str1[m - 1] == str2[n - 1]:
    return editDistance(str1, str2, m - 1, n - 1)
  return 1 + min(editDistance(str1, str2, m, n - 1), editDistance(str1, str2, m - 1, n), 1 + editDistance(str1, str2, m - 1, n - 1))

str1 = 'horse'
str2 = 'ros'
print('Minimum edit distance(recursive approach): ', editDistance(str1, str2, len(str1), len(str2)))

Minimum edit distance(recursive approach):  4


In [11]:
def editDistance(str1, str2):
  len1 = len(str1)
  len2 = len(str2)
  dp = [[-1] * (len1 + 1) for i in range(len2 + 1)]
  for i in range(len1 + 1):
    dp[0][1] = i
  for i in range(len2 + 1):
    dp[i][0] = i
  for j in range(1, len2 + 1):
    for i in range(1, len1 + 1):
      if str1[i - 1] == str2[j - 1]:
        dp[j][i] = dp[j - 1][i - 1]
      else:
        insert = 1 + dp[j - 1][i]
        delete = 1 + dp[j][i - 1]
        replace = 2 + dp[j - 1][i - 1]
        dp[j][i] = min(insert, delete, replace)
  return dp[-1][-1]

str1 = 'horse'
str2 = 'ros'
print('Minimum edit distance(dynamic programming approach): ', editDistance(str1, str2))

Minimum edit distance(dynamic programming approach):  1


In [12]:
df = pd.read_csv('all-data.csv', encoding = 'ISO-8859-1')
df.columns = ['sentiment', 'news']
df.head()

Unnamed: 0,sentiment,news
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4845 entries, 0 to 4844
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  4845 non-null   object
 1   news       4845 non-null   object
dtypes: object(2)
memory usage: 75.8+ KB


In [14]:
df[df['sentiment'].isna() == False].count()

sentiment    4845
news         4845
dtype: int64

In [15]:
df['sentiment'].value_counts()

neutral     2878
positive    1363
negative     604
Name: sentiment, dtype: int64

In [16]:
y = df['sentiment'].values
y.shape

(4845,)

In [17]:
y

array(['neutral', 'negative', 'positive', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [18]:
x = df['news'].values
x.shape

(4845,)

In [19]:
(x_train, x_test, y_train, y_test) = train_test_split(x, y, test_size = 0.4)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(2907,)
(1938,)
(2907,)
(1938,)


In [20]:
df1 = pd.DataFrame(x_train)
df1 = df1.rename(columns = {0: 'news'})

In [21]:
df2 = pd.DataFrame(y_train)
df2 = df2.rename(columns = {0: 'sentiment'})
df_train = pd.concat([df1, df2], axis = 1)

In [22]:
df3 = pd.DataFrame(x_test)
df3 = df3.rename(columns = {0: 'news'})

In [23]:
df4 = pd.DataFrame(y_test)
df4 = df4.rename(columns = {0: 'sentiment'})
df_test = pd.concat([df3, df4], axis = 1)

In [24]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [25]:
def remove_punctuation(text):
  if(type(text) == float):
    return text
  ans = ''
  for i in text:
    if i not in string.punctuation:
      ans = ans + i
  return ans

In [26]:
df_train['news'] = df_train['news'].apply(lambda x: remove_punctuation(x))
df_test['news'] = df_test['news'].apply(lambda x: remove_punctuation(x))

In [None]:
df_train.head()