# Exploration   
  
In this document, I'll explore the problem interactively.

In [1]:
import pandas as pd

In [2]:
# Read the text file into a one-column dataframe, one row per input line.
# dtype=str keeps every line as literal text (a file of digit-only lines is
# not parsed as numbers), and keep_default_na=False stops tokens such as
# "NA" or "null" from being converted to NaN.
df = pd.read_csv('input.txt', header=None, names=['Text'],
                 dtype=str, keep_default_na=False)

# Display the dataframe
df


Unnamed: 0,Text
0,gsjgklneight6zqfz
1,7one718onegfqtdbtxfcmd
2,xvtfhkm8c9
3,914two8
4,vxzzvdhfqfsix83c1ttvbbstxgdrkfcnmm3
...,...
995,5onesixsevenphxtmlqhzfcjxrknpv
996,gldsixrhss186seven6
997,gnpksz4
998,4919


In [3]:
# Flag each line that holds at least two digits: the pattern requires one
# digit, any run of characters, and then another digit.
df = df.assign(ContainsDigits=df['Text'].str.contains(r'\d.*\d'))
df

Unnamed: 0,Text,ContainsDigits
0,gsjgklneight6zqfz,False
1,7one718onegfqtdbtxfcmd,True
2,xvtfhkm8c9,True
3,914two8,True
4,vxzzvdhfqfsix83c1ttvbbstxgdrkfcnmm3,True
...,...,...
995,5onesixsevenphxtmlqhzfcjxrknpv,False
996,gldsixrhss186seven6,True
997,gnpksz4,False
998,4919,True


In [4]:
def get_coordinate(row):
    """Build the coordinate string for a row flagged as containing digits.

    Reads ``row['ContainsDigits']`` and ``row['Text']``. When the flag is
    truthy and the text has at least one digit character, returns the first
    and last digit characters concatenated as a string. In every other case
    returns the empty string.
    """
    if not row['ContainsDigits']:
        return ''
    found = list(filter(str.isdigit, row['Text']))
    if not found:
        return ''
    first, last = found[0], found[-1]
    return first + last

# Compute the coordinate row by row and store it in a 'Coordinate' column
df['Coordinate'] = df.apply(get_coordinate, axis='columns')
df


Unnamed: 0,Text,ContainsDigits,Coordinate
0,gsjgklneight6zqfz,False,
1,7one718onegfqtdbtxfcmd,True,78
2,xvtfhkm8c9,True,89
3,914two8,True,98
4,vxzzvdhfqfsix83c1ttvbbstxgdrkfcnmm3,True,83
...,...,...,...
995,5onesixsevenphxtmlqhzfcjxrknpv,False,
996,gldsixrhss186seven6,True,16
997,gnpksz4,False,
998,4919,True,49


In [5]:
# True when at least one row has no digit at all, i.e. when it is not the
# case that every row contains a digit.
every_row_has_digit = df['Text'].str.contains(r'\d').all()
no_digits_rows = not every_row_has_digit
no_digits_rows


False

In [6]:
def get_coordinate(row):
    """Fill in the coordinate for rows not flagged as containing two digits.

    Reads ``row['ContainsDigits']``, ``row['Text']`` and ``row['Coordinate']``.

    Returns:
        For a row whose flag is falsy and whose text has at least one digit
        character: the first and last digit joined and converted to ``int``.
        For any other row: the existing ``row['Coordinate']`` value, returned
        unchanged (so the resulting column can hold a mix of types).
    """
    if not row['ContainsDigits']:
        digits = [char for char in row['Text'] if char.isdigit()]
        if digits:
            # Use first + last digit. For a single-digit line both are the same
            # digit, so the result is that digit doubled; if the flag is wrong
            # and the line really has several digits, this still gives the
            # right coordinate instead of doubling the first digit.
            return int(f"{digits[0]}{digits[-1]}")
    return row['Coordinate']  # Keep the existing value if present

# Re-run the row-wise pass so rows without a coordinate get one filled in
df['Coordinate'] = df.apply(get_coordinate, axis='columns')
df


Unnamed: 0,Text,ContainsDigits,Coordinate
0,gsjgklneight6zqfz,False,66
1,7one718onegfqtdbtxfcmd,True,78
2,xvtfhkm8c9,True,89
3,914two8,True,98
4,vxzzvdhfqfsix83c1ttvbbstxgdrkfcnmm3,True,83
...,...,...,...
995,5onesixsevenphxtmlqhzfcjxrknpv,False,55
996,gldsixrhss186seven6,True,16
997,gnpksz4,False,44
998,4919,True,49


In [7]:
# Inspect the Python type of every value stored in the Coordinate column
coordinate_types = df['Coordinate'].map(type)
print(coordinate_types)



0      <class 'int'>
1      <class 'str'>
2      <class 'str'>
3      <class 'str'>
4      <class 'str'>
           ...      
995    <class 'int'>
996    <class 'str'>
997    <class 'int'>
998    <class 'str'>
999    <class 'str'>
Name: Coordinate, Length: 1000, dtype: object


In [8]:
# Cast the Coordinate column to integers so it can be summed numerically
df = df.astype({'Coordinate': int})


In [9]:
# Total of all coordinates
df['Coordinate'].sum()

55108

This answer is too low according to the site. Time to find out where I made a mistake.

In [10]:
# Dump every row in turn to eyeball where the coordinates go wrong
for row_label, record in df.iterrows():
    print(record)


Text              gsjgklneight6zqfz
ContainsDigits                False
Coordinate                       66
Name: 0, dtype: object
Text              7one718onegfqtdbtxfcmd
ContainsDigits                      True
Coordinate                            78
Name: 1, dtype: object
Text              xvtfhkm8c9
ContainsDigits          True
Coordinate                89
Name: 2, dtype: object
Text              914two8
ContainsDigits       True
Coordinate             98
Name: 3, dtype: object
Text              vxzzvdhfqfsix83c1ttvbbstxgdrkfcnmm3
ContainsDigits                                   True
Coordinate                                         83
Name: 4, dtype: object
Text              76mkvhmbkpm
ContainsDigits           True
Coordinate                 76
Name: 5, dtype: object
Text              8sixssmlzlhrnineggmrvg6
ContainsDigits                       True
Coordinate                             86
Name: 6, dtype: object
Text              threeninedtr7219
ContainsDigits                

Text              7three9ftpvvbzpjzsixssxbqcfsix4
ContainsDigits                               True
Coordinate                                     74
Name: 374, dtype: object
Text              92eightlsgrmpqtpptxrdfxthreemvlxfpsevenoneightdd
ContainsDigits                                                True
Coordinate                                                      92
Name: 375, dtype: object
Text              eight7three
ContainsDigits          False
Coordinate                 77
Name: 376, dtype: object
Text              drxxcghbsevenninenineljtczntp6
ContainsDigits                             False
Coordinate                                    66
Name: 377, dtype: object
Text              nine55cpqfkdrthree3
ContainsDigits                   True
Coordinate                         53
Name: 378, dtype: object
Text              7fivefp3eight
ContainsDigits             True
Coordinate                   73
Name: 379, dtype: object
Text              97nine1ninexprdpvqzps
ContainsDigi

The third entry is incorrect already. The text `xvtfhkm8c9` contains at least two digits, but the corresponding entry in the `ContainsDigits` column is False. Time to find out why the regex failed.

In [11]:
# Does row 13 contain a run of two or more consecutive digits?
df['Text'].str.contains(r'\d{2,}').loc[13]

False

In [12]:
# Look at the raw input lines again
df.loc[:, "Text"]

0                        gsjgklneight6zqfz
1                   7one718onegfqtdbtxfcmd
2                               xvtfhkm8c9
3                                  914two8
4      vxzzvdhfqfsix83c1ttvbbstxgdrkfcnmm3
                      ...                 
995         5onesixsevenphxtmlqhzfcjxrknpv
996                    gldsixrhss186seven6
997                                gnpksz4
998                                   4919
999                                  pbc19
Name: Text, Length: 1000, dtype: object

I think I got it. A pattern like `\d{2,}` only matches two or more *adjacent* digits, so the regex fails on instances where the two digits are not next to each other and classifies them as False. I will have to correct this. I'm thinking about replacing the regex with a simple check of how long the digits list is.

In [13]:
# Total of all coordinates, checked again
df['Coordinate'].sum()

55108