### ***Libraries***

In [267]:
import pandas as pd
import zipfile as zf
import os
import glob
import fileinput

#### ***1. Unzip files*** 

In [268]:
# Location of the zip archive and the folder its contents are extracted into.
file_route = 'data.zip'
destination_file = 'data'

# Open the archive read-only and unpack every member under the destination folder.
with zf.ZipFile(file_route, mode='r') as archive:
    archive.extractall(path=destination_file)

#### ***2. Reading the files***

In [269]:
# Build `sequence`: one (source_path, line) tuple for every line of every
# document found under <destination_file>/<element>/<file>/.
sequence = []

# NOTE(review): os.listdir() returns entries in arbitrary order, so the
# positional slices below ([:2] and [1:]) keep/skip whatever happens to sit at
# those positions on this machine. Confirm which entries are meant to be
# excluded and consider filtering by name instead.
elements = os.listdir(destination_file)
for element in elements[:2]:
    files = os.listdir(f"{destination_file}/{element}")
    for file in files[1:]:
        # Every document inside this folder; sorted so the row order is
        # reproducible across runs and platforms.
        filenames = sorted(glob.glob(f"{destination_file}/{element}/{file}/*"))
        if not filenames:
            # fileinput.input() reads from stdin when given an empty file
            # list, so skip entries that contain no documents.
            continue
        with fileinput.input(files=filenames) as f:
            for line in f:
                # f.filename() is the path of the file currently being read.
                sequence.append((f.filename(), line))

sequence

[('data/test/negative\\0000.txt',
  'Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars '),
 ('data/test/negative\\0001.txt',
  '( ADPnews ) - Feb 3 , 2010 - Finland-based steel maker Rautaruukki Oyj ( HEL : RTRKS ) , or Ruukki , said today it slipped to a larger-than-expected pretax loss of EUR 46 million ( USD 64.5 m ) in the fourth quarter of 2009 from '),
 ('data/test/negative\\0002.txt',
  'In Q2 of 2009 , profit before taxes amounted to EUR 13.6 mn , down from EUR 26.8 mn in Q2 of 2008 '),
 ('data/test/negative\\0003.txt',
  'ADPnews - Jul 17 , 2009 - Finland-based steel maker Rautaruukki Oyj Ruukki HEL : RTRKS said today it slipped to a net loss of EUR 184 million USD 259.7 m for the first half of 2009 from a net profit of EUR '),
 ('data/test/negative\\0004.txt',
  "Salonen added that data shows producers ' pulp inventories in North America are declining . "),
 ('data/test/negative\\0005.txt',
  'In the third qu

#### ***3. Writing the list to a ".txt" file***

In [270]:
# Name of the output text file.
doc_name = 'Archivo.txt'

# Write one record per line in the form "<source path><TAB><text>".
with open(doc_name, 'w') as arc:
    for path, text in sequence:
        # Drop any line ending the text already carries so that every record
        # is terminated by exactly one newline.
        clean_text = text.rstrip('\r\n')
        arc.write(f"{path}\t{clean_text}\n")

print('Contenido creado !!')

Contenido creado !!


#### ***4. Convert each tuple in the "sequence" list into a list***

In [271]:
# Turn every tuple stored in `sequence` into a list with the same items,
# producing a list of lists.
text_lines = list(map(list, sequence))

text_lines
    


[['data/test/negative\\0000.txt',
  'Jan. 6 -- Ford is struggling in the face of slowing truck and SUV sales and a surfeit of up-to-date , gotta-have cars '],
 ['data/test/negative\\0001.txt',
  '( ADPnews ) - Feb 3 , 2010 - Finland-based steel maker Rautaruukki Oyj ( HEL : RTRKS ) , or Ruukki , said today it slipped to a larger-than-expected pretax loss of EUR 46 million ( USD 64.5 m ) in the fourth quarter of 2009 from '],
 ['data/test/negative\\0002.txt',
  'In Q2 of 2009 , profit before taxes amounted to EUR 13.6 mn , down from EUR 26.8 mn in Q2 of 2008 '],
 ['data/test/negative\\0003.txt',
  'ADPnews - Jul 17 , 2009 - Finland-based steel maker Rautaruukki Oyj Ruukki HEL : RTRKS said today it slipped to a net loss of EUR 184 million USD 259.7 m for the first half of 2009 from a net profit of EUR '],
 ['data/test/negative\\0004.txt',
  "Salonen added that data shows producers ' pulp inventories in North America are declining . "],
 ['data/test/negative\\0005.txt',
  'In the third qu

##### ***4.1 Validating the dimensions of the list***

In [272]:
# Rows: number of records. Columns: length of the longest record.
num_rows = len(text_lines)
# default=0 avoids a ValueError from max() when text_lines is empty.
num_columns = max((len(lines) for lines in text_lines), default=0)

print(num_rows)
print(num_columns)

2264
2


#### ***5. Create a dataframe***

In [273]:
# Two-column frame built from `text_lines`: the first item of each record goes
# to 'directory', the second to 'phrase'.
column_names = ['directory', 'phrase']
df = pd.DataFrame(data=text_lines, columns=column_names)

df

Unnamed: 0,directory,phrase
0,data/test/negative\0000.txt,Jan. 6 -- Ford is struggling in the face of sl...
1,data/test/negative\0001.txt,"( ADPnews ) - Feb 3 , 2010 - Finland-based ste..."
2,data/test/negative\0002.txt,"In Q2 of 2009 , profit before taxes amounted t..."
3,data/test/negative\0003.txt,"ADPnews - Jul 17 , 2009 - Finland-based steel ..."
4,data/test/negative\0004.txt,Salonen added that data shows producers ' pulp...
...,...,...
2259,data/train/positive\0453.txt,The company said that paper demand increased i...
2260,data/train/positive\0454.txt,Outotec said it won new orders worth 492.9 mln...
2261,data/train/positive\0455.txt,`` The number of collection errors fell consid...
2262,data/train/positive\0456.txt,Both operating profit and turnover for the six...


#### ***6. Create columns within dataframe***

In [274]:
# Derive the 'class' and 'sentiment' columns from the path in 'directory',
# which has the shape <root>/<class>/<sentiment>/<file name>.
#
# The paths can mix '/' with the OS-native separator (a backslash on Windows),
# so normalise to '/' first and pick the components counting from the right.
# This works whichever separator the platform produced.
path_parts = (
    df['directory']
    .str.replace('\\', '/', regex=False)  # literal replacement, not a regex
    .str.split('/')
)

# Resulting columns, in order: phrase, class, sentiment.
df = (
    df.assign(**{
        'class': path_parts.str[-3],      # e.g. 'train' / 'test'
        'sentiment': path_parts.str[-2],  # e.g. 'negative' / 'positive'
    })
    .drop(columns=['directory'])
)

df




Unnamed: 0,phrase,class,sentiment
0,Jan. 6 -- Ford is struggling in the face of sl...,test,negative
1,"( ADPnews ) - Feb 3 , 2010 - Finland-based ste...",test,negative
2,"In Q2 of 2009 , profit before taxes amounted t...",test,negative
3,"ADPnews - Jul 17 , 2009 - Finland-based steel ...",test,negative
4,Salonen added that data shows producers ' pulp...,test,negative
...,...,...,...
2259,The company said that paper demand increased i...,train,positive
2260,Outotec said it won new orders worth 492.9 mln...,train,positive
2261,`` The number of collection errors fell consid...,train,positive
2262,Both operating profit and turnover for the six...,train,positive


#### ***7. Build dataframes for exercise***

In [275]:
# Split the frame by the 'class' column and drop that column from each part,
# since the split itself now carries that information.
df_train_csv = df.loc[df['class'] == 'train'].drop(columns=['class'])
df_test_csv = df.loc[df['class'] == 'test'].drop(columns=['class'])

print(df_train_csv)
print(df_test_csv)

# Export each split to its own CSV file without the index column.
df_train_csv.to_csv('train_dataset.csv', index=False)
df_test_csv.to_csv('test_dataset.csv', index=False)

                                                 phrase sentiment
453   The real estate company posted a net loss of +...  negative
454   The EU Commission said earlier it had fined Th...  negative
455   Cargo traffic fell 1 % year-on-year to 8,561 t...  negative
456   Finnish media group Talentum has issued a prof...  negative
457   Net profit fell by almost half to +Ã¯Â¿Â½ 5.5 ...  negative
...                                                 ...       ...
2259  The company said that paper demand increased i...  positive
2260  Outotec said it won new orders worth 492.9 mln...  positive
2261  `` The number of collection errors fell consid...  positive
2262  Both operating profit and turnover for the six...  positive
2263  Commission income rose by 25.7 % to EUR 16.1 m...  positive

[1811 rows x 2 columns]
                                                phrase sentiment
0    Jan. 6 -- Ford is struggling in the face of sl...  negative
1    ( ADPnews ) - Feb 3 , 2010 - Finland-based ste..

#### ***8. Check the exported CSV***

In [278]:
# Read the exported training CSV back and print the name of its second column.
df_t = pd.read_csv('train_dataset.csv')
second_column = df_t.columns[1]
print(second_column)


sentiment


In [None]:
# Optional check (disabled): count the phrases per sentiment in each split.
# NOTE(review): as written, enabling these lines rebinds df_train_csv and
# df_test_csv to the count results, replacing the DataFrames they held.
# df_train_csv = df_train_csv.groupby('sentiment')['phrase'].count()
# df_test_csv = df_test_csv.groupby('sentiment')['phrase'].count()
# print(df_train_csv)
# print(df_test_csv)

